diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 2188c65aa0b..9655ebf552a 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -1,67 +1,13 @@
-megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo
-
-megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt
-
-megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal
-
-megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
-megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
-
-megatron/core/models/hybrid/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-model
-
-megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets
-
-megatron/core/tokenizers/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/tokenizers
-
-megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp
-
-megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp
-
-megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing
-
-megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer
-
-megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference
-
-megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets
-
-megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism
-
-megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/transformer
-
-megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech
-
-megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference
-
-megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo
-
-megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training
-
-megatron/post_training/ @NVIDIA/post-training
-
-megatron/core/transformer/cuda_graphs.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/cuda-graphs
-
-megatron/training/ @NVIDIA/training-adlr @NVIDIA/training-nemo
-megatron/training/arguments.py
+* @NVIDIA/core-nemo @NVIDIA/core-devtech
 
 .gitlab/ @NVIDIA/ci
 .github/ @NVIDIA/ci
 .github/oncall_schedule.json @NVIDIA/mcore-oncall-rotation
 .gitlab-ci.yml @NVIDIA/ci
 docker/  @NVIDIA/ci
+tests/unit_tests/run_ci_test.sh @NVIDIA/ci
+tests/test_utils/python_scripts/
 tests/functional_tests/python_test_utils/ @NVIDIA/ci
 tests/functional_tests/shell_test_utils/ @NVIDIA/ci
-tests/test_utils/recipes/ @NVIDIA/ci
-tests/unit_tests/run_ci_test.sh @NVIDIA/ci
-
-# API Backwards Compatibility Check
-scripts/check_api_backwards_compatibility.py @NVIDIA/ci
-scripts/README_API_COMPAT.md @NVIDIA/ci
-.github/workflows/check_api_backwards_compatibility_workflow.yml @NVIDIA/ci
-docs/api-backwards-compatibility-check.md @NVIDIA/ci
-tests/unit_tests/test_api_backwards_compat_setup.py @NVIDIA/ci
-
-megatron/rl/ @NVIDIA/reinforcement-learning
-examples/rl/ @NVIDIA/reinforcement-learning
-test/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
-train_rl.py @NVIDIA/reinforcement-learning
+pyproject.toml @NVIDIA/ci
+uv.lock @NVIDIA/ci
diff --git a/.github/scripts/sync_team_usergroups.py b/.github/scripts/sync_team_usergroups.py
index c5f40f5fe33..01ef49c9e0a 100644
--- a/.github/scripts/sync_team_usergroups.py
+++ b/.github/scripts/sync_team_usergroups.py
@@ -19,12 +19,12 @@
 Slack user groups to match.
 """
 
+import argparse
 import os
 import re
 import sys
-import argparse
-import requests
 
+import requests
 from slack_sdk import WebClient
 from slack_sdk.errors import SlackApiError
 
@@ -53,10 +53,7 @@ def get_headers():
         print("Error: GH_TOKEN or GITHUB_TOKEN not set")
         sys.exit(1)
 
-    return {
-        "Authorization": f"token {token}",
-        "Accept": "application/vnd.github.v3+json",
-    }
+    return {"Authorization": f"token {token}", "Accept": "application/vnd.github.v3+json"}
 
 
 def get_org():
@@ -215,9 +212,7 @@ def get_user_email(username):
 
                 # Check Signed-off-by lines in the commit message for @nvidia.com emails
                 message = commit_data.get('message', '')
-                sob_matches = re.findall(
-                    r'Signed-off-by:.*<([^>]+@nvidia\.com)>', message
-                )
+                sob_matches = re.findall(r'Signed-off-by:.*<([^>]+@nvidia\.com)>', message)
                 if sob_matches:
                     _email_cache[username] = sob_matches[0]
                     print(f"Found @nvidia.com email for {username} from Signed-off-by")
@@ -339,21 +334,14 @@ def create_slack_usergroup(slack_client, handle, team_slug):
 
     try:
         print(f"Creating Slack usergroup '@{handle}' with name '{name}'...")
-        response = slack_client.usergroups_create(
-            name=name,
-            handle=handle,
-            description=description,
-        )
+        response = slack_client.usergroups_create(name=name, handle=handle, description=description)
         usergroup = response.get("usergroup", {})
         usergroup_id = usergroup.get("id")
 
         if usergroup_id:
             # Update cache with new usergroup
             if _usergroups_cache is not None:
-                _usergroups_cache[handle] = {
-                    "id": usergroup_id,
-                    "users": [],
-                }
+                _usergroups_cache[handle] = {"id": usergroup_id, "users": []}
             print(f"Successfully created Slack usergroup '@{handle}'")
             return usergroup_id
         else:
@@ -446,9 +434,7 @@ def sync_team_to_usergroup(team_slug, usergroup_handle, dry_run=False):
 
     # 5. Update the usergroup
     try:
-        slack_client.usergroups_users_update(
-            usergroup=usergroup_id, users=slack_user_ids
-        )
+        slack_client.usergroups_users_update(usergroup=usergroup_id, users=slack_user_ids)
         print(f"\nSuccessfully updated '@{usergroup_handle}' with {len(slack_user_ids)} members")
         return True
     except SlackApiError as e:
@@ -530,18 +516,12 @@ def sync_all_teams(dry_run=False, parent_teams=None, direct_teams=None):
 
 
 def main():
-    parser = argparse.ArgumentParser(
-        description="Sync GitHub team membership to Slack user groups"
-    )
+    parser = argparse.ArgumentParser(description="Sync GitHub team membership to Slack user groups")
     parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        help="Show what would be done without making changes",
+        "--dry-run", action="store_true", help="Show what would be done without making changes"
     )
     parser.add_argument(
-        "--list",
-        action="store_true",
-        help="List all configured team-to-usergroup mappings",
+        "--list", action="store_true", help="List all configured team-to-usergroup mappings"
     )
     parser.add_argument(
         "--parent-team",
@@ -559,8 +539,7 @@ def main():
         dest="direct_teams",
         metavar="SLUG",
         help=(
-            "Sync this GitHub team directly (can be repeated). "
-            f"Defaults to: {DIRECT_TEAM_SLUGS}"
+            "Sync this GitHub team directly (can be repeated). " f"Defaults to: {DIRECT_TEAM_SLUGS}"
         ),
     )
 
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 1e62f6b3016..35eb570296d 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -78,8 +78,8 @@ jobs:
           IS_MERGE_GROUP: ${{ github.event_name == 'merge_group' }}
           SCHEDULED_JOB: ${{ github.event_name == 'schedule' }}
         run: |
-          # Skip SSO check for scheduled jobs, main branch, or merge groups
-          if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
+          # Skip SSO check for scheduled jobs, main branch, dev branch, or merge groups
+          if [ "${{ env.SCHEDULED_JOB }}" == "true" ] || [ "${IS_MAIN_BRANCH}" == "true" ] || [ "${IS_DEV_BRANCH}" == "true" ] || [ "${IS_MERGE_GROUP}" == "true" ]; then
             echo "is_maintainer=true" | tee -a $GITHUB_OUTPUT
             exit 0
           fi
diff --git a/.github/workflows/mirror-to-main.yml b/.github/workflows/mirror-to-main.yml
new file mode 100644
index 00000000000..cb77851942b
--- /dev/null
+++ b/.github/workflows/mirror-to-main.yml
@@ -0,0 +1,129 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+name: Mirror Dev to Main
+
+on:
+  push:
+    branches:
+      - "pull-request/[0-9]+"
+
+jobs:
+  cherry-pick-to-main:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+          token: ${{ secrets.PAT }}
+
+      - name: Get PR info
+        id: get-pr-info
+        uses: nv-gha-runners/get-pr-info@main
+
+      - name: Configure Git
+        run: |
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+          git config --global user.name "GitHub Actions Bot"
+
+      - name: Cherry-pick to main
+        env:
+          GH_TOKEN: ${{ secrets.PAT }}
+        run: |
+          set -x
+
+          PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
+          BASE_REF="${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.ref }}"
+          HAS_MIRROR_MAIN_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "mirror-to-main")' || echo "false")
+          TARGET_BRANCH="cherry-pick-$PR_NUMBER-into-main"
+
+          # Skip if not labeled with mirror-to-main
+          if [ "$HAS_MIRROR_MAIN_LABEL" != "true" ]; then
+            echo "PR is not labeled with mirror-to-main, will not mirror to main."
+            exit 0
+          fi
+
+          # Skip if not targeting dev
+          if [ "$BASE_REF" != "dev" ]; then
+            echo "PR is not targeting dev, will not mirror to main."
+            exit 0
+          fi
+
+          # Check if target branch already exists
+          if git ls-remote --heads origin "refs/heads/$TARGET_BRANCH" | grep -q .; then
+            echo "Target branch already exists, will not cherry-pick again."
+            exit 0
+          fi
+
+          # Get PR details
+          PR_AUTHOR="${{ fromJSON(steps.get-pr-info.outputs.pr-info).user.login }}"
+          PR_TITLE="${{ fromJSON(steps.get-pr-info.outputs.pr-info).title }}"
+          SOURCE_BRANCH="${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.ref }}"
+          SOURCE_REPO="${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.repo.full_name }}"
+
+          # Fetch all branches
+          git fetch origin dev
+
+          # Handle forks vs same repo
+          if [ "$SOURCE_REPO" = "${{ github.repository }}" ]; then
+            git fetch origin "$SOURCE_BRANCH"
+            git checkout "$SOURCE_BRANCH"
+          else
+            git fetch "https://github.com/$SOURCE_REPO.git" "$SOURCE_BRANCH"
+            git checkout FETCH_HEAD
+          fi
+
+          # Find commit range to cherry-pick
+          START_COMMIT=$(git merge-base origin/dev HEAD)
+          END_COMMIT=$(git rev-parse HEAD)
+
+          # Create cherry-pick branch from main
+          git fetch origin main
+          git checkout main
+          git checkout -b "$TARGET_BRANCH"
+
+          # Cherry-pick commits
+          if ! git cherry-pick "$START_COMMIT..$END_COMMIT"; then
+            # Comment on the original PR about the failure
+            COMMENT_BODY=$(cat <<'EOF'
+          ❌ **Cherry-pick to main failed**
+
+          The cherry-pick encountered conflicts and could not be completed automatically.
+
+          **Next steps:**
+          1. Manually create a PR with these changes to main
+          2. Resolve any conflicts
+          EOF
+            )
+
+            gh pr comment $PR_NUMBER --body "$COMMENT_BODY"
+            exit 1
+          fi
+
+          # Push branch
+          git push -u origin "$TARGET_BRANCH"
+
+          # Create PR to main
+          gh pr create \
+            --base main \
+            --head "$TARGET_BRANCH" \
+            --title "cp: \`$PR_TITLE ($PR_NUMBER)\` into \`main\`" \
+            --body "[🤖]: Hi @$PR_AUTHOR 👋<br><br>We've cherry-picked \`$PR_TITLE (#$PR_NUMBER)\` into \`main\` for you! 🚀<br><br>Please review and approve this cherry-pick at your convenience!" \
+            --label "cherry-pick" \
+            --reviewer "$PR_AUTHOR"
+
diff --git a/.github/workflows/multi-approval-bot.yml b/.github/workflows/multi-approval-bot.yml
deleted file mode 100644
index c7477679201..00000000000
--- a/.github/workflows/multi-approval-bot.yml
+++ /dev/null
@@ -1,74 +0,0 @@
-name: "Codeowners Approval Workflow"
-
-on:
-  push:
-    branches:
-      - "pull-request/[0-9]+"
-  merge_group:
-    types: [checks_requested]
-
-jobs:
-  pre-flight:
-    uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.73.2
-    if: github.repository == 'NVIDIA/Megatron-LM'
-
-  codeowners-approval:
-    needs: [pre-flight]
-    runs-on: ubuntu-latest
-    if: |
-      !(needs.pre-flight.outputs.docs_only == 'true'
-      || needs.pre-flight.outputs.is_merge_group == 'true'
-      || needs.pre-flight.outputs.is_deployment_workflow == 'true')
-    steps:
-      - name: Get PR info
-        id: get-pr-info
-        if: startsWith(github.ref, 'refs/heads/pull-request/')
-        uses: nv-gha-runners/get-pr-info@main
-
-      - name: Checkout action
-        uses: actions/checkout@v6
-        with:
-          repository: noamelf/codeowner-multi-approval-action
-          ref: v0.1
-          path: codeowner-multi-approval-action
-
-      - name: Check Codeowners Approval
-        uses: ./codeowner-multi-approval-action
-        with:
-          pr-number: ${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
-          repo-name: ${{ github.repository }}
-          github-token: ${{ secrets.PAT }}
-
-  multi-approval-bot-summary:
-    needs: [pre-flight, codeowners-approval]
-    if: |
-      (
-        needs.pre-flight.outputs.docs_only == 'true'
-        || needs.pre-flight.outputs.is_merge_group == 'true'
-        || needs.pre-flight.outputs.is_deployment_workflow == 'true'
-        || always()
-      )
-      && github.repository == 'NVIDIA/Megatron-LM'
-      && !cancelled()
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout repository
-        uses: actions/checkout@v6
-
-      - name: Result
-        env:
-          GH_TOKEN: ${{ github.token }}
-          GITHUB_RUN_ID: ${{ github.run_id }}
-          SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
-        run: |
-          FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
-
-          if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then
-              echo "✅ All previous jobs completed successfully"
-              exit 0
-          else
-              echo "❌ Found $FAILED_JOBS failed job(s)"
-              # Show which jobs failed
-              gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
-              exit 1
-          fi
diff --git a/.gitlab/stages/00.pre.yml b/.gitlab/stages/00.pre.yml
index 1d35494dcd6..e00ce8afc36 100644
--- a/.gitlab/stages/00.pre.yml
+++ b/.gitlab/stages/00.pre.yml
@@ -71,6 +71,7 @@ pre:create_ci_branches_dev:
       - branch: ci-dev-rebuild-mcore-nemo-image
       - branch: ci-dev-mr
       - branch: ci-dev-nightly
+      - branch: ci-dev-weekly
       - branch: ci-dev-upgrade-dependencies
   tags:
     - arch/amd64
diff --git a/.gitlab/stages/04.functional-tests.yml b/.gitlab/stages/04.functional-tests.yml
index 55c4d740659..002c96e7c0f 100644
--- a/.gitlab/stages/04.functional-tests.yml
+++ b/.gitlab/stages/04.functional-tests.yml
@@ -255,7 +255,7 @@ functional:x_notify:
     - export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
     - export GITLAB_ENDPOINT
     - export CONTEXT=$FUNCTIONAL_TEST_SCOPE
-    - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || "0")
+    - export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]] && echo "1" || "0")
     - export TEAM_SLUG=$SLACK_ADMIN
     - |
       python tests/test_utils/python_scripts/notify.py \
@@ -269,7 +269,7 @@ functional:x_notify:
     paths:
       - scripts
   rules:
-    - if: ($CI_PIPELINE_SOURCE == "schedule" || $CI_COMMIT_BRANCH == "main") && $FUNCTIONAL_TEST == "yes"
+    - if: ($CI_PIPELINE_SOURCE == "schedule" || $CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev") && $FUNCTIONAL_TEST == "yes"
       when: always
     - when: never
 
diff --git a/README.md b/README.md
index 9a62f9bb750..b22a8d0e8f6 100644
--- a/README.md
+++ b/README.md
@@ -1,21 +1,31 @@
 <div align="center">
 
-Megatron-LM and Megatron Core
-=============================
+Megatron-LM & Megatron Core
+===========================
 
 <h4>GPU-optimized library for training transformer models at scale</h4>
 
-[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html)
-[![version](https://img.shields.io/badge/release-0.15.0-green)](./CHANGELOG.md)
+[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://docs.nvidia.com/Megatron-Core/developer-guide/latest/index.html)
+[![version](https://img.shields.io/badge/release-0.12.0-green)](./CHANGELOG.md)
 [![license](https://img.shields.io/badge/license-Apache-blue)](./LICENSE)
 
 <div align="left">
 
-## About
+> ## 🚨 **DEVELOPMENT BRANCH**
+> ⚠️ **EXPERIMENTAL FEATURES** - This is the **dev branch** with experimental features. 
+>
+> **→ For releases and comprehensive documentation, visit the [main branch](https://github.com/NVIDIA/Megatron-LM)**
 
-This repository contains two components: **Megatron-LM** and **Megatron Core**.
+## ⚡ Quickstart
 
-**Megatron-LM** is a reference example that includes Megatron Core plus pre-configured training scripts. Best for research teams, learning distributed training, and quick experimentation.
+```bash
+# Clone the dev branch
+git clone -b dev https://github.com/NVIDIA/Megatron-LM.git
+cd Megatron-LM
+
+# Install from source with dev dependencies (includes transformer_engine)
+pip install -e .[mlm,dev]
+```
 
 **Megatron Core** is a composable library with GPU-optimized building blocks for custom training frameworks. It provides transformer building blocks, advanced parallelism strategies (TP, PP, DP, EP, CP), mixed precision support (FP16, BF16, FP8, FP4), and model architectures. Best for framework developers and ML engineers building custom training pipelines.
 
@@ -58,14 +68,21 @@ For NGC container setup and all installation options, see the **[Installation Gu
 - **[2025/05]** Megatron Core v0.11.0 brings new capabilities for multi-data center LLM training ([blog](https://developer.nvidia.com/blog/turbocharge-llm-training-across-long-haul-data-center-networks-with-nvidia-nemo-framework/)).
 
 <details>
-<summary>Previous News</summary>
+<summary>Table of Contents</summary>
 
-- **[2024/07]** Megatron Core v0.7 improves scalability and training resiliency and adds support for multimodal training ([blog](https://developer.nvidia.com/blog/train-generative-ai-models-more-efficiently-with-new-nvidia-Megatron-Core-functionalities/)).
-- **[2024/06]** Megatron Core added supports for Mamba-based models. Check out our paper [An Empirical Study of Mamba-based Language Models](https://arxiv.org/pdf/2406.07887) and [code example](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
-- **[2024/01 Announcement]** NVIDIA has released the core capabilities in **Megatron-LM** into [**Megatron Core**](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core) in this repository. Megatron Core expands upon Megatron-LM's GPU-optimized techniques with more cutting-edge innovations on system-level optimizations, featuring composable and modular APIs.
+**Getting Started**
+- [⚡ Quick Start](#-quick-start)
+- [🧠 Dev Branch Philosophy](#-dev-branch-philosophy)
+- [📊 Performance & Benchmarking](#-performance--benchmarking)
+- [👥 Community & Support](#-community--support)
+
+**For Complete Documentation** → [Main Branch](https://github.com/NVIDIA/Megatron-LM) | [Official Docs](https://docs.nvidia.com/Megatron-Core/)
 
 </details>
 
+
+## Dev Branch Philosophy
+
 # Project Structure
 
 ```
@@ -128,17 +145,32 @@ We also strong scaled the standard GPT-3 model (our version has slightly more th
 
 # Roadmaps
 
-- **[MoE Roadmap](https://github.com/NVIDIA/Megatron-LM/issues/1729)** - DeepSeek-V3, Qwen3, advanced parallelism, FP8 optimizations, and Blackwell enhancements
+### Fast Iteration
+- **Streamlined Review**: 1 code owner + 1 dev approver (can delegate review) + CI/CD
+
+### Feature Lifecycle (Coming Soon)
+- **6-Month Timeline**: Experimental features must graduate to stable or be deprecated
+- **Migration Support**: Assistance provided for feature transitions
+
+### Stability Expectations
+- **Experimental Nature**: Features may change or be removed as development progresses
+- **Testing**: All features will pass convergence and performance validation before inclusion
+- **Support**: Dev branch issues should include `[DEV]` prefix
 
 # Resources
 
-## Getting Help
+## Performance & Benchmarking
 
-- 📖 **[Documentation](https://docs.nvidia.com/megatron-core/developer-guide/latest/index.html)** - Official documentation
-- 🐛 **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests
+- 🚀 [2025/11] [Optimizing DeepSeek-V3 Training Performance on NVIDIA GB200 NVL72](docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-optimization.md).
+- ⚡ [2025/11] [A Guide to Reproduce DeepSeek-V3 Pre-training Performance on GB200](docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md).
 
-## Contributing
+## Community & Support
 
+### Getting Help
+- 📖 **[Documentation](https://docs.nvidia.com/Megatron-Core/)** - Official documentation
+- 🐛 **[Issues](https://github.com/NVIDIA/Megatron-LM/issues)** - Bug reports and feature requests
+
+### Contributing
 We ❤️ contributions! Ways to contribute:
 
 - 🐛 **Report bugs** - Help us improve reliability
@@ -146,12 +178,9 @@ We ❤️ contributions! Ways to contribute:
 - 📝 **Improve docs** - Make Megatron Core more accessible
 - 🔧 **Submit PRs** - Contribute code improvements
 
-**→ [Contributing Guide](https://docs.nvidia.com/megatron-core/developer-guide/latest/developer/contribute.html)**
-
-## Citation
-
-If you use Megatron in your research or project, we appreciate that you use the following citations:
+**→ [Contributing Guide](./CONTRIBUTING.md)**
 
+### Citation
 ```bibtex
 @article{megatron-lm,
   title={Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism},
diff --git a/docker/Dockerfile.ci.dev b/docker/Dockerfile.ci.dev
index 5e3d7419b3a..7f3a5c0552a 100644
--- a/docker/Dockerfile.ci.dev
+++ b/docker/Dockerfile.ci.dev
@@ -16,7 +16,7 @@ ENV UV_LINK_MODE=copy
 
 RUN bash -ex <<"EOF"
     apt-get update
-    apt-get install -y --no-install-recommends gettext python3-venv psmisc uuid-runtime
+    apt-get install -y --no-install-recommends gettext python3-venv psmisc uuid-runtime 
     apt-get clean
     python -m venv /opt/jet
     ARCH=$(uname -m)
@@ -31,17 +31,21 @@ RUN bash -ex <<"EOF"
     curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
 EOF
 
+RUN ln -sf /usr/local/cuda/targets/x86_64-linux/include/cuda \
+    /usr/local/include/cuda
+RUN find /usr/local/cuda -name "utility" 2>/dev/null | head -5 && \
+    ls /usr/local/cuda/targets/x86_64-linux/include/ | head -20
+
 COPY README.md pyproject.toml uv.lock /workspace/
 COPY megatron/core/__init__.py /workspace/megatron/core/
 COPY megatron/core/package_info.py /workspace/megatron/core/
 ARG IMAGE_TYPE=dev
-ENV NVTE_BUILD_NUM_PHILOX_ROUNDS=3
 RUN --mount=type=cache,target=/root/.cache/uv \
     bash -ex <<"EOF"
     export NVTE_CUDA_ARCHS="80;90;100"
     uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
     uv sync --only-group build
-    uv sync --extra ${IMAGE_TYPE} --extra mlm --link-mode copy --locked \
+    uv sync --extra ${IMAGE_TYPE} --extra mlm --group no_pypi_wheels --link-mode copy --locked \
         --no-install-package torch \
         --no-install-package torchvision \
         --no-install-package triton \
@@ -71,7 +75,7 @@ RUN bash -ex <<"EOF"
 
     git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git
     pushd DeepEP
-        git checkout 34152ae28f80bcc3ee38d7a12cb2ad87cfd4ea72
+        git checkout eb9cee7de5a24193bf09500668d3a619d3d3f3fb
         patch -p1 < /workspace/deepep.patch
     popd
     TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/.
diff --git a/docs/add_copyright_header.py b/docs/add_copyright_header.py
index 9694ef84819..9bc4481c506 100644
--- a/docs/add_copyright_header.py
+++ b/docs/add_copyright_header.py
@@ -1,3 +1,5 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 #!/usr/bin/env python3
 """One-off script to add NVIDIA copyright header to all .md files under docs/."""
 
diff --git a/docs/conf.py b/docs/conf.py
index 47532648b15..26b618b1eac 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -20,7 +20,6 @@
 import os
 import sys
 
-
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
@@ -84,24 +83,17 @@
     # This is a workaround that uses the parser located in autodoc2_docstrings_parser.py to allow autodoc2 to
     # render google style docstrings.
     # Related Issue: https://github.com/sphinx-extensions2/sphinx-autodoc2/issues/33
-    autodoc2_docstring_parser_regexes = [
-        (r".*", "docs.autodoc2_docstrings_parser"),
-    ]
+    autodoc2_docstring_parser_regexes = [(r".*", "docs.autodoc2_docstrings_parser")]
     # Regex patterns whose values contain raw regex syntax (e.g. \p{L}) that docutils
     # mis-parses as footnote/reference markup. Exclude them from the generated docs.
-    autodoc2_hidden_regexes = [
-        r".*\._PATTERN_TIKTOKEN.*",
-    ]
+    autodoc2_hidden_regexes = [r".*\._PATTERN_TIKTOKEN.*"]
 
 # -- Options for HTML output -------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
 
 html_theme = "nvidia_sphinx_theme"
 html_theme_options = {
-    "switcher": {
-        "json_url": "../versions1.json",
-        "version_match": release,
-    },
+    "switcher": {"json_url": "../versions1.json", "version_match": release},
     "icon_links": [
         {
             "name": "GitHub",
@@ -114,7 +106,4 @@
 html_extra_path = ["project.json", "versions1.json"]
 
 # Github links are now getting rate limited from the Github Actions
-linkcheck_ignore = [
-    ".*github\\.com.*",
-    ".*githubusercontent\\.com.*",
-]
+linkcheck_ignore = [".*github\\.com.*", ".*githubusercontent\\.com.*"]
diff --git a/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md b/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md
new file mode 100644
index 00000000000..8fa3051e479
--- /dev/null
+++ b/docs/discussions/deepseek-v3-gb200-optimization/deepseek-v3-gb200-reproduce-guide.md
@@ -0,0 +1,358 @@
+---
+orphan: true
+---
+
+# A Guide to Reproduce DeepSeek-V3 Pre-training Performance on GB200
+
+## 1. Dockerfile
+
+Requirements:
+- Transformer Engine: We recommend using commit [d2945c6](https://github.com/NVIDIA/TransformerEngine/commit/d2945c6a571e3978677614d1fe08779966a5a4ef) with PR [2146](https://github.com/NVIDIA/TransformerEngine/pull/2146) and [2150](https://github.com/NVIDIA/TransformerEngine/pull/2150). You could prepare the branch by yourself, or use this [branch](https://github.com/hxbai/TransformerEngine/commits/dev_20251024/) based on TE v2.9 plus the above three commits/PRs.
+- cuDNN: v9.14 is required.
+- HybridEP: Install it from [here](https://github.com/deepseek-ai/DeepEP/commits/3f601f7ac1c062c46502646ff04c535013bfca00).
+
+Dockerfile for reference.
+
+```dockerfile
+FROM nvcr.io/nvidia/pytorch:25.09-py3 AS base
+
+ENV SHELL=/bin/bash
+
+# =========================
+# Install system packages
+# =========================
+RUN rm -rf /opt/megatron-lm && \
+    apt-get update && \
+    apt-get install -y sudo gdb bash-builtins git zsh autojump tmux curl gettext libfabric-dev && \
+    wget https://github.com/mikefarah/yq/releases/download/v4.27.5/yq_linux_arm64 -O /usr/bin/yq && \
+    chmod +x /usr/bin/yq
+
+# =========================
+# Install Python packages
+# =========================
+# NOTE: `unset PIP_CONSTRAINT` to install packages that do not meet the default constraint in the base image.
+# Some package requirements and related versions are from 
+#   https://github.com/NVIDIA/Megatron-LM/blob/core_v0.12.0/Dockerfile.linting.
+#   https://github.com/NVIDIA/Megatron-LM/blob/core_v0.12.0/requirements_mlm.txt.
+#   https://github.com/NVIDIA/Megatron-LM/blob/core_v0.12.0/requirements_ci.txt.
+RUN unset PIP_CONSTRAINT && pip install --no-cache-dir debugpy dm-tree torch_tb_profiler einops wandb \
+    sentencepiece tokenizers transformers torchvision ftfy modelcards datasets tqdm pydantic \
+    nvidia-pytriton py-spy yapf darker \
+    tiktoken flask-restful \
+    nltk wrapt pytest pytest_asyncio pytest-cov pytest_mock pytest-random-order \
+    black==24.4.2 isort==5.13.2 flake8==7.1.0 pylint==3.2.6 coverage mypy \
+    setuptools==69.5.1
+
+# =========================
+# Install cudnn 9.14.0.64 for correct mxfp8 quantization and layernorm fusion
+# =========================
+RUN apt-get update && \
+    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/sbsa/cuda-keyring_1.1-1_all.deb && \
+    dpkg -i cuda-keyring_1.1-1_all.deb && \
+    apt-get update && \
+    apt-get -y install libcudnn9-cuda-13
+
+# =========================
+# Install latest TE
+# Use a specific commit instead of main to make it more stable.
+# This is based on release_v2.9 branch and contains some CPU and quantization optimizations.
+# =========================
+ARG COMMIT="7dd3914726abb79bc99ff5a5db1449458ed64151"
+ARG TE="git+https://github.com/hxbai/TransformerEngine.git@${COMMIT}"
+RUN pip install nvidia-mathdx==25.1.1 && \
+    unset PIP_CONSTRAINT && \
+    NVTE_CUDA_ARCHS="100" NVTE_BUILD_THREADS_PER_JOB=8 NVTE_FRAMEWORK=pytorch pip install --no-build-isolation --no-cache-dir $TE
+
+# =========================
+# Install HybridEP
+# =========================
+WORKDIR /home/
+RUN git clone --branch hybrid-ep https://github.com/deepseek-ai/DeepEP.git && \
+    cd DeepEP && git checkout 3f601f7ac1c062c46502646ff04c535013bfca00 && \
+    TORCH_CUDA_ARCH_LIST="10.0" pip install --no-build-isolation .
+
+# =========================
+# Clean cache
+# =========================
+RUN rm -rf /root/.cache /tmp/*
+```
+
+> [!Tip]
+>
+> If you prefer to use CUDA 12.9, please change the base container to `nvcr.io/nvidia/pytorch:25.06-py3` and the cuDNN to be installed to `libcudnn9-cuda-12`. 
+
+## 2. Megatron-Core
+
+We recommend using the [dev branch](https://github.com/NVIDIA/Megatron-LM/tree/dev) after PR [1917](https://github.com/NVIDIA/Megatron-LM/pull/1917).
+
+```bash
+git clone https://github.com/NVIDIA/Megatron-LM.git && \
+cd Megatron-LM &&
+git checkout effebd81f410bc6566fffee6c320b6f8f762e06d
+```
+
+## 3. Cluster Configuration
+
+Since we're using EP 32 on NVL72, it's important to make sure
+
+> [!Important]
+> **Every 32 GB200 GPUs (8 nodes) are in the same NVL domain (or rack)**.
+
+Usually you can make it via your cluster workload manager. Taking Slurm as an example, you could pass `--segment 8` to the sbatch command to ensure that every segment of 8 nodes will be scheduled to a rack.
+
+## 4. Training scripts
+
+### Environment variables
+
+```bash
+CUDA_DEVICE_MAX_CONNECTIONS=1
+NVTE_FWD_LAYERNORM_SM_MARGIN=0
+NVTE_BWD_LAYERNORM_SM_MARGIN=0
+NVLINK_DOMAIN_SIZE=72
+NVTE_ALLOW_NONDETERMINISTIC_ALGO=1
+PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+NCCL_NVLS_ENABLE=0
+NVTE_FUSED_ATTN=1
+NVTE_NORM_FWD_USE_CUDNN=1
+NVTE_NORM_BWD_USE_CUDNN=1
+PYTHONWARNINGS=ignore
+NCCL_DEBUG=VERSION
+NCCL_GRAPH_REGISTER=0
+```
+
+### bindpcie
+
+Download [bindpcie](https://github.com/NVIDIA/mlperf-common/blob/main/client/bindpcie) to your workdir, make it executable, 
+
+```bash
+wget https://raw.githubusercontent.com/NVIDIA/mlperf-common/refs/heads/main/client/bindpcie &&
+chmod 755 bindpcie
+```
+
+and then
+
+> [!Important]
+> **Place it at the beginning of your launch command in every process.**
+
+Taking Slurm as an example, your script should look like
+
+```bash
+#!/bin/bash
+
+#SBATCH [... sbatch args]
+
+srun [... srun args] /path/to/bindpcie /path/to/pretrain_gpt.py [... mcore arguments]
+```
+
+This is a very important step on GB200.
+
+### Launch script
+
+```bash
+/path/to/bindpcie \
+/path/to/megatron-lm/pretrain_gpt.py \
+--distributed-timeout-minutes 60 \
+--tensor-model-parallel-size 1 \
+--pipeline-model-parallel-size 8 \
+--expert-model-parallel-size 32 \
+--context-parallel-size 1 \
+--expert-tensor-parallel-size 1 \
+--use-distributed-optimizer \
+--overlap-grad-reduce \
+--overlap-param-gather \
+--use-mcore-models \
+--sequence-parallel \
+--use-flash-attn \
+--disable-bias-linear \
+--micro-batch-size 1 \
+--global-batch-size 2048 \
+--train-samples 585937500 \
+--exit-duration-in-mins 220 \
+--no-save-optim \
+--no-check-for-nan-in-loss-and-grad \
+--cross-entropy-loss-fusion \
+--cross-entropy-fusion-impl te \
+--manual-gc \
+--manual-gc-interval 10 \
+--enable-experimental \
+--transformer-impl transformer_engine \
+--seq-length 4096 \
+--data-cache-path /path/to/data_cache \
+--tokenizer-type HuggingFaceTokenizer \
+--tokenizer-model unsloth/DeepSeek-V3 \
+--data-path /path/to/data \
+--split 99,1,0 \
+--no-mmap-bin-files \
+--no-create-attention-mask-in-dataloader \
+--num-workers 6 \
+--num-layers 61 \
+--hidden-size 7168 \
+--ffn-hidden-size 18432 \
+--num-attention-heads 128 \
+--kv-channels 128 \
+--max-position-embeddings 4096 \
+--position-embedding-type rope \
+--rotary-base 10000 \
+--make-vocab-size-divisible-by 3232 \
+--normalization RMSNorm \
+--norm-epsilon 1e-6 \
+--swiglu \
+--untie-embeddings-and-output-weights \
+--multi-latent-attention \
+--attention-dropout 0.0 \
+--hidden-dropout 0.0 \
+--clip-grad 1.0 \
+--weight-decay 0.1 \
+--qk-layernorm \
+--lr-decay-samples 584765624 \
+--lr-warmup-samples 1536000 \
+--lr-warmup-init 3.9e-7 \
+--lr 3.9e-6 \
+--min-lr 3.9e-7 \
+--lr-decay-style cosine \
+--adam-beta1 0.9 \
+--adam-beta2 0.95 \
+--num-experts 256 \
+--moe-layer-freq ([0]*3+[1]*58) \
+--moe-ffn-hidden-size 2048 \
+--moe-shared-expert-intermediate-size 2048 \
+--moe-router-load-balancing-type seq_aux_loss \
+--moe-router-topk 8 \
+--moe-grouped-gemm \
+--moe-aux-loss-coeff 1e-4 \
+--moe-router-group-topk 4 \
+--moe-router-num-groups 8 \
+--moe-router-pre-softmax \
+--moe-router-padding-for-quantization \
+--moe-router-topk-scaling-factor 2.5 \
+--moe-router-score-function sigmoid \
+--moe-router-enable-expert-bias \
+--moe-router-bias-update-rate 1e-3 \
+--moe-router-dtype fp32 \
+--moe-permute-fusion \
+--moe-router-fusion \
+--q-lora-rank 1536 \
+--kv-lora-rank 512 \
+--qk-head-dim 128 \
+--qk-pos-emb-head-dim 64 \
+--v-head-dim 128 \
+--rotary-scaling-factor 40 \
+--mscale 1.0 \
+--mscale-all-dim 1.0 \
+--eval-iters 32 \
+--eval-interval 200 \
+--no-load-optim \
+--no-load-rng \
+--auto-detect-ckpt-format \
+--load None \
+--save /path/to/checkpoints \
+--save-interval 500 \
+--dist-ckpt-strictness log_all \
+--init-method-std 0.02 \
+--log-timers-to-tensorboard \
+--log-memory-to-tensorboard \
+--log-validation-ppl-to-tensorboard \
+--log-throughput \
+--log-interval 1 \
+--logging-level 40 \
+--tensorboard-dir /path/to/tensorboard \
+--wandb-project deepseek-v3-benchmarking-v0.15 \
+--wandb-exp-name DeepSeek-V3-TP1PP8EP32CP1VPP4-MBS1GBS2048-v0.15 \
+--bf16 \
+--enable-experimental \
+--recompute-granularity selective \
+--recompute-modules moe_act mlp \
+--cuda-graph-impl transformer_engine \
+--cuda-graph-scope attn moe_router moe_preprocess \
+--te-rng-tracker \
+--pipeline-model-parallel-layout "Et|(tt|)*30L" \
+--moe-router-force-load-balancing \
+--moe-token-dispatcher-type flex \
+--moe-flex-dispatcher-backend hybridep \
+--moe-hybridep-num-sms 32 \
+--fp8-recipe mxfp8 \
+--fp8-format e4m3 \
+--fp8-param-gather \
+--reuse-grad-buf-for-mxfp8-param-ag \
+--use-precision-aware-optimizer \
+--main-grads-dtype fp32 \
+--main-params-dtype fp32 \
+--exp-avg-dtype bf16 \
+--exp-avg-sq-dtype bf16 \
+```
+
+### Explanation of arguments
+
+The following arguments indicate key optimizations.
+
+- Pipeline parallel layout
+
+```bash
+--pipeline-model-parallel-layout "Et|(tt|)*30L"
+```
+
+`E` stands for embedding, `t` for transformer layer, `L` for Loss. So it's interpreted as a total of 32 stages, where the first stage is Embedding + 1 transformer layer, the last stage is Loss, and the middle 30 stages are 2 transformer layers.
+
+- Fine-grained recompute
+
+```bash
+--recompute-granularity selective \
+--recompute-modules moe_act mlp \
+```
+
+- Partial CUDA Graphs
+
+```bash
+--cuda-graph-impl transformer_engine \
+--cuda-graph-scope attn moe_router moe_preprocess \
+--te-rng-tracker \
+```
+
+- Force load balancing for performance benchmark
+
+```bash
+--moe-router-force-load-balancing \
+```
+
+- HybridEP
+
+```bash
+--moe-token-dispatcher-type flex \
+--moe-flex-dispatcher-backend hybridep \
+--moe-hybridep-num-sms 32 \
+```
+
+- MXFP8 recipe
+
+```bash
+--fp8-recipe mxfp8 \
+--fp8-format e4m3 \
+--fp8-param-gather \
+--reuse-grad-buf-for-mxfp8-param-ag \
+```
+
+- BF16 optimizer states
+
+```bash
+--use-precision-aware-optimizer \
+--main-grads-dtype fp32 \
+--main-params-dtype fp32 \
+--exp-avg-dtype bf16 \
+--exp-avg-sq-dtype bf16 \
+```
+
+- Kernel fusions
+
+```bash
+--cross-entropy-loss-fusion \
+--cross-entropy-fusion-impl te \
+--moe-permute-fusion \
+--moe-router-fusion \
+```
+
+- Manual GC to make ranks better synchronized
+
+```bash
+--manual-gc \
+--manual-gc-interval 10 \
+```
diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image1.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image1.png
new file mode 100644
index 00000000000..6e4dad685c4
Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image1.png differ
diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image2.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image2.png
new file mode 100644
index 00000000000..920e3c57f94
Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image2.png differ
diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image3.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image3.png
new file mode 100644
index 00000000000..f606dbfb744
Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image3.png differ
diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image4.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image4.png
new file mode 100644
index 00000000000..04239401edd
Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image4.png differ
diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image5.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image5.png
new file mode 100644
index 00000000000..0128fc7ae45
Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image5.png differ
diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image6.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image6.png
new file mode 100644
index 00000000000..cb2ed2eb9ad
Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image6.png differ
diff --git a/docs/discussions/deepseek-v3-gb200-optimization/images/image7.png b/docs/discussions/deepseek-v3-gb200-optimization/images/image7.png
new file mode 100644
index 00000000000..325d0fd4f52
Binary files /dev/null and b/docs/discussions/deepseek-v3-gb200-optimization/images/image7.png differ
diff --git a/docs/source/api-guide/router_replay.md b/docs/source/api-guide/router_replay.md
new file mode 100644
index 00000000000..b2e043b3065
--- /dev/null
+++ b/docs/source/api-guide/router_replay.md
@@ -0,0 +1,180 @@
+---
+orphan: true
+---
+
+# Design Document: MoE Router Replay Feature
+
+## 1. Overview
+
+This document provides a detailed description of the "Router Replay" feature implemented within the Megatron-LM Core for Mixture-of-Experts (MoE) models.
+
+This feature is designed to enhance determinism and analyzability in MoE model training and inference. It enables the model to load routing decisions from a predefined file and enforce their use during the forward pass, thereby bypassing the real-time routing computation.
+
+## 2. Motivation
+
+*   **Determinism & Reproducibility**: In distributed training, MoE routing decisions can exhibit minor variations due to factors like floating-point precision. By replaying a fixed routing table, the MoE computation path is guaranteed to be identical across runs, which facilitates debugging and reproducing experimental results.
+*   **Performance Profiling**: The router's own computation (e.g., logits calculation, top-k selection) incurs overhead. In replay mode, this part of the computation can be completely skipped, allowing for more precise isolation and profiling of performance bottlenecks within the Expert Layers themselves.
+*   **Debugging Aid**: When issues arise in the model, fixing the routing decisions helps to isolate variables, making it easier to determine whether the problem lies with the routing mechanism or the expert computations.
+
+## 3. Design and Architecture
+
+The design follows the principles of being non-intrusive and on-demand, with the core idea of activating the replay logic only when explicitly requested by the user.
+
+*   **Core Components**:
+    *   `RouterReplay` (located in `megatron/core/transformer/moe/router_replay.py`): A utility class for replaying MoE routing decisions. When enabled via the `enable_routing_replay` flag, a separate instance of `RouterReplay` is created for each MoE layer's router. Each instance is responsible for loading routing data and providing the deterministic routing decisions for its corresponding layer during the forward pass.
+    *   `enable_routing_replay` (located in `megatron/core/transformer/transformer_config.py`): A boolean global configuration flag that serves as the sole entry point for enabling this feature.
+
+*   **Workflow**:
+    The feature supports different modes, such as recording and replaying, controlled by a `RouterReplayAction`.
+
+    1.  **Enabling the Feature**: The user sets `enable_routing_replay` to `True` in the model configuration.
+    2.  **Initialization**: When `enable_routing_replay` is true, each `TopKRouter` creates its own `RouterReplay` instance.
+    3.  **Mode Configuration**: The user must programmatically set the desired router replay action (e.g., `record`, `forward_replay`, `backward_replay`) on the `RouterReplay` instances.
+    4.  **Execution Flow (within a mini-batch)**:
+        *   **Forward Pass**:
+            *   For each micro-batch, the `topk_routing_with_score_function` checks the `router_replay_action`.
+            *   **In `record` mode**: The dynamically computed `top-k` expert indices are captured and stored.
+            *   **In `forward_replay` mode**: The function retrieves pre-loaded expert indices from `target_topk_idx`. These indices are used for the forward computation and are also appended to the `replay_backward_list` to prepare for the backward pass.
+        *   **Backward Pass**:
+            *   For each micro-batch (processed in reverse order in pipeline parallelism), the `router_replay_action` is checked again.
+            *   **In `backward_replay` mode**: The function retrieves the expert indices for the corresponding micro-batch by popping them from the `replay_backward_list`. This mode is intended for training recomputation (e.g., activation checkpointing and pipeline recompute) so the same routing decisions are used during recompute/backward as in forward, ensuring determinism and correctness.
+
+## 4. Implementation Details
+
+The implementation cleanly separates the replay logic from the router's core computation.
+
+*   **`megatron/core/transformer/transformer_config.py`**:
+    *   Adds the configuration option `enable_routing_replay: bool = False`.
+
+*   **`megatron/core/transformer/moe/moe_utils.py`**:
+    *   Introduces the `RouterReplay` class to manage the state for recording and replaying routing decisions for a single MoE layer.
+        *   `target_topk_idx`: An attribute holding the expert indices for the current micro-batch during forward replay mode.
+        *   `recorded_topk_idx`: An attribute for storing the computed expert indices when in record mode.
+        *   `replay_backward_list`: A list that accumulates the `top-k` indices used during the forward passes of a mini-batch. This list is consumed in FIFO order during the backward pass to ensure correctness under pipeline parallelism.
+        *   `set_target_indices()`: A method to load the replay indices into `target_topk_idx` for the forward pass.
+        *   `record_indices()`: A method to save the computed indices.
+    *   The `topk_routing_with_score_function` is modified to contain the core logic. It checks the `router_replay_action` on the `router_replay` instance and accordingly performs one of the following actions: computes and records indices, replays indices from `target_topk_idx` (for forward), replays indices from `replay_backward_list` (for backward), or falls through to the default dynamic routing.
+
+### Training recompute usage
+- During forward replay, `set_target_indices()` prepares `replay_backward_list` so each micro-batch’s indices are available for recomputation.
+- During recompute/backward, set action to `REPLAY_BACKWARD` so indices are consumed in FIFO order to mirror the forward sequence.
+
+## 5. Usage Guide
+
+1.  **Enable & Instantiate**
+    - Create one `RouterReplay` instance per MoE router layer when building the model.
+    - Optionally use the global helpers to set/clear actions across all layers.
+2.  **Record Routing Decisions**
+    - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)`.
+    - Run the model; retrieve per-layer indices via `RouterReplay.get_recorded_data()` and persist.
+3.  **Forward Replay**
+    - Load indices and distribute: `RouterReplay.set_replay_data(list_of_tensors)`.
+    - Set action: `RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD)`.
+    - Run the model; dynamic top‑k is bypassed and target indices are used.
+4.  **Backward Replay**
+    - For training recomputation (activation checkpointing or pipeline recompute), set action: `REPLAY_BACKWARD` during recomputation.
+    - Per micro‑batch indices are consumed from `replay_backward_list` in FIFO order.
+5.  **Cleanup**
+    - Use `RouterReplay.clear_global_indices()`, `RouterReplay.clear_global_router_replay_action()`, and `RouterReplay.clear_global_router_replay_instances()` to restore default behavior and prevent memory leaks.
+
+### Quick usage with `topk_routing_with_score_function`
+
+```python
+import torch
+from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction
+from megatron.core.transformer.moe.moe_utils import topk_routing_with_score_function
+
+rr = RouterReplay()
+
+# Record
+RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)
+logits = torch.randn(8, 16)
+probs_rec, routing_map_rec = topk_routing_with_score_function(
+    logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr,
+)
+recorded = rr.get_recorded_indices()
+torch.save(recorded, "/tmp/replay.pt")
+
+# Forward replay
+rr.clear_router_replay_action()
+rr.set_router_replay_action(RouterReplayAction.REPLAY_FORWARD)
+target = torch.load("/tmp/replay.pt")
+rr.set_target_indices(target)
+probs_rep, routing_map_rep = topk_routing_with_score_function(
+    logits=logits, topk=2, use_pre_softmax=False, score_function="softmax", router_replay=rr,
+)
+
+RouterReplay.clear_global_router_replay_action()
+RouterReplay.clear_global_indices()
+RouterReplay.clear_global_router_replay_instances()
+```
+
+## 6. Minimal Demo
+
+Here is a minimal code example showing how to use RouterReplay for recording and replaying:
+
+```python
+import torch
+import torch.distributed as dist
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.transformer.moe.router import TopKRouter
+from megatron.core.transformer.moe.router_replay import RouterReplay, RouterReplayAction
+
+
+# Initialize distributed training
+if not dist.is_initialized():
+    dist.init_process_group(backend="nccl")
+
+# Create a transformer config with RouterReplay enabled
+config = TransformerConfig(
+    num_experts=8,
+    expert_model_parallel_size=1,
+    num_top_k=2,
+    enable_routing_replay=True
+)
+
+# Create a TopKRouter instance
+router = TopKRouter(config)
+
+# Generate sample input (batch_size, sequence_length, hidden_size)
+logits = torch.randn(16, 32, 8).to(torch.cuda.current_device())
+
+# -----------------
+# 1. Recording Mode
+# -----------------
+print("=== Recording Mode ===")
+# Set global router replay action to RECORD
+RouterReplay.set_global_router_replay_action(RouterReplayAction.RECORD)
+
+# Perform routing
+routing_output = router.forward(logits)
+print(f"Recorded top-k indices shape: {routing_output.top_k_idx.shape}")
+
+# -----------------
+# 2. Forward Replay Mode
+# -----------------
+print("\n=== Forward Replay Mode ===")
+# Save recorded indices to a file
+torch.save(routing_output.top_k_idx, "/tmp/replay.pt")
+
+# Load indices from file and set as target for replay
+replay_indices = torch.load("/tmp/replay.pt")
+for router_instance in RouterReplay.global_router_replay_instances:
+    router_instance.target_topk_idx = replay_indices
+
+# Set global router replay action to REPLAY_FORWARD
+RouterReplay.set_global_router_replay_action(RouterReplayAction.REPLAY_FORWARD)
+
+# Perform routing again - this will use the replayed indices
+replay_routing_output = router.forward(logits)
+print(f"Replayed top-k indices shape: {replay_routing_output.top_k_idx.shape}")
+print(f"Are indices the same? {torch.equal(routing_output.top_k_idx, replay_routing_output.top_k_idx)}")
+
+
+# Clean up
+RouterReplay.clear_global_router_replay_action()
+RouterReplay.clear_global_indices()
+RouterReplay.clear_global_router_replay_instances()
+if dist.is_initialized():
+    dist.destroy_process_group()
+```
diff --git a/docs/user-guide/features/index.md b/docs/user-guide/features/index.md
index d080fe3f256..514568afac9 100644
--- a/docs/user-guide/features/index.md
+++ b/docs/user-guide/features/index.md
@@ -20,6 +20,7 @@ context_parallel
 custom_fsdp
 dist_optimizer
 optimizer_cpu_offload
+paged_stash
 pipeline_parallel_layout
 tokenizers
 megatron_energon
diff --git a/docs/user-guide/features/paged_stash.md b/docs/user-guide/features/paged_stash.md
new file mode 100644
index 00000000000..4b7d807ace2
--- /dev/null
+++ b/docs/user-guide/features/paged_stash.md
@@ -0,0 +1,59 @@
+<!---
+   Copyright (c) 2022-2026, NVIDIA CORPORATION. All rights reserved.
+   NVIDIA CORPORATION and its licensors retain all intellectual property
+   and proprietary rights in and to this software, related documentation
+   and any modifications thereto. Any use, reproduction, disclosure or
+   distribution of this software and related documentation without an express
+   license agreement from NVIDIA CORPORATION is strictly prohibited.
+-->
+
+# MoE Paged Stash
+
+*This is an experimental feature and may change.*
+
+**Paged stash** = **sync-free** expert execution + **paged stashing** (packing routed-expert activations for backward into paged buffers).
+
+**Sync-free:** `--moe-flex-dispatcher-backend hybridep`, `--use-transformer-engine-op-fuser`, and `--moe-expert-rank-capacity-factor` pre-size dispatch and fused grouped expert buffers from a user-controlled capacity, avoiding a per-step device query / realloc loop for buffer sizing.
+
+**Paged stashing:** `--moe-paged-stash` stores those activations in paged CUDA buffers (optional pinned host spill). It helps save activation memory; sync-free still works without it, at the cost of higher activation memory use.
+
+Whenever `moe_expert_rank_capacity_factor` is set, a **runner** wraps forward-backward: after each pass it checks **stash overflow** (only with `--moe-paged-stash`) and **token over-budget**. If either hits any rank, the step **reruns once** without capacity padding and without paged stashing.
+
+## Prerequisites
+
+HybridEP + TE fused grouped experts are required whenever `moe_expert_rank_capacity_factor` is set. With `moe_paged_stash` enabled: capacity factor must be set; no `cpu_offloading`; `offload_modules` must not include `expert_fc1` or `moe_act`. The runner is active whenever capacity factor is set (even without `--moe-paged-stash`) for over-budget reruns; stash overflow is checked only when paged stashing is on.
+
+## Configuration
+
+```bash
+# Sync-free
+--moe-flex-dispatcher-backend hybridep
+--use-transformer-engine-op-fuser
+--moe-expert-rank-capacity-factor <float>
+
+# Paged stashing (to avoid memory waste due to fragmentation)
+--moe-paged-stash
+```
+
+## Tuning (paged stashing only)
+
+```bash
+# Page size for stashing
+--moe-paged-stash-page-size 64
+# CUDA stashing buffer scaling factor (default 1.10)
+--moe-paged-stash-buffer-size-factor-cuda 1.10
+# Host spill (0 = off); same sign rule as CUDA
+--moe-paged-stash-buffer-size-factor-cpu 0.0
+```
+
+## What `moe_expert_rank_capacity_factor` and `moe_paged_stash_buffer_size_factor_cuda` mean
+
+Both are **multipliers on buffer size relative to the perfectly balanced case**—the space you would need if routed tokens were evenly distributed across expert ranks. A larger factor adds headroom for real-world **skew**.
+
+## Choosing `moe_expert_rank_capacity_factor` and stash buffer scales
+
+Profile how far real routing departs from the **balanced** reference, then pick factors so **skew spikes** rarely exceed your margin (avoid constant reruns).
+
+- **`moe_expert_rank_capacity_factor`:** pick from profiles so **over-budget token drop** is uncommon; set **slightly above** the profiled value so reruns stay rare.
+- **`moe_paged_stash_buffer_size_factor_cuda`:** size from the **same stats** (peaks vs averages) so **stash overflow** is uncommon; undersizing triggers reruns like over-budget.
+- **`moe_paged_stash_buffer_size_factor_cpu`:** set **> 0** to allow **spill to pinned host** when CUDA pages are full—often **avoids overflow / rerun** at the cost of host memory and more overhead from paged stashing.
diff --git a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py
index 183c6c695bd..a8b72bb39ae 100644
--- a/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py
+++ b/examples/academic_paper_scripts/detxoify_lm/generate_samples_gpt.py
@@ -6,28 +6,33 @@
 import json
 import os
 import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir, os.path.pardir)))
+
+sys.path.append(
+    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+)
+from typing import Union
+
 import torch
-from megatron.training import get_args
-from megatron.training import get_tokenizer
-from megatron.training import print_rank_0
-from megatron.training.checkpointing import load_checkpoint
+
+import megatron.legacy.model
 from megatron.core import mpu
-from megatron.training.arguments import parse_and_validate_args
-from megatron.training.initialize import initialize_megatron
-from megatron.legacy.model import GPTModel
-from megatron.training import get_model
-from megatron.inference.text_generation import generate_and_post_process
-from megatron.training.arguments import core_transformer_config_from_args
 from megatron.core.models.gpt import GPTModel
-from typing import Union
-import megatron.legacy.model
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_local_spec,
+    get_gpt_layer_with_transformer_engine_spec,
+)
 from megatron.core.transformer.spec_utils import import_module
-from megatron.training.arguments import core_transformer_config_from_args
-from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec, get_gpt_layer_local_spec
+from megatron.inference.text_generation import generate_and_post_process
+from megatron.legacy.model import GPTModel
+from megatron.training import get_args, get_model, get_tokenizer, print_rank_0
+from megatron.training.arguments import core_transformer_config_from_args, parse_and_validate_args
+from megatron.training.checkpointing import load_checkpoint
+from megatron.training.initialize import initialize_megatron
 
-def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megatron.legacy.model.GPTModel]:
+
+def model_provider(
+    pre_process=True, post_process=True
+) -> Union[GPTModel, megatron.legacy.model.GPTModel]:
     """Builds the model.
 
     If you set the use_legacy_models to True, it will return the legacy GPT model and if not the core GPT model.
@@ -51,26 +56,23 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
             num_tokentypes=0,
             parallel_output=False,
             pre_process=pre_process,
-            post_process=post_process
+            post_process=post_process,
         )
     else:
         if args.spec is None:
             if args.transformer_impl == 'local':
                 transformer_layer_spec = get_gpt_layer_local_spec(
-                    num_experts=args.num_experts,
-                    moe_grouped_gemm=args.moe_grouped_gemm
+                    num_experts=args.num_experts, moe_grouped_gemm=args.moe_grouped_gemm
                 )
             elif args.transformer_impl == 'transformer_engine':
                 transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
-                    num_experts=args.num_experts,
-                    moe_grouped_gemm=args.moe_grouped_gemm
+                    num_experts=args.num_experts, moe_grouped_gemm=args.moe_grouped_gemm
                 )
             else:
                 raise ValueError(f"Invalid transformer_impl {args.transformer_impl}")
         elif args.spec[0] == 'local':
             transformer_layer_spec = get_gpt_layer_local_spec(
-                num_experts=args.num_experts,
-                moe_grouped_gemm=args.moe_grouped_gemm
+                num_experts=args.num_experts, moe_grouped_gemm=args.moe_grouped_gemm
             )
         else:
             transformer_layer_spec = import_module(args.spec)
@@ -86,37 +88,46 @@ def model_provider(pre_process=True, post_process=True) -> Union[GPTModel, megat
             parallel_output=False,
             share_embeddings_and_output_weights=not args.untie_embeddings_and_output_weights,
             position_embedding_type=args.position_embedding_type,
-            rotary_percent=args.rotary_percent
+            rotary_percent=args.rotary_percent,
         )
 
     return model
 
+
 def add_text_generate_args(parser):
     """Text generation arguments."""
     group = parser.add_argument_group(title='text generation')
 
-    group.add_argument("--temperature", type=float, default=1.0,
-                       help='Sampling temperature.')
-    group.add_argument("--greedy", action='store_true', default=False,
-                       help='Use greedy sampling.')
-    group.add_argument("--top_p", type=float, default=0.0,
-                       help='Top p sampling.')
-    group.add_argument("--top_k", type=int, default=0,
-                       help='Top k sampling.')
-    group.add_argument("--out-seq-length", type=int, default=1024,
-                       help='Size of the output generated text.')
-    group.add_argument("--sample-input-file", type=str, default=None,
-                       help='Get input from file instead of interactive mode, '
-                       'each line is an input.')
-    group.add_argument("--sample-output-file", type=str, default=None,
-                       help='Output file got from --sample-input-file')
-    group.add_argument("--num-samples", type=int, default=0,
-                       help='Number of samples to generate unconditionally, '
-                       'defaults to 0 and interactive conditional sampling')
-    group.add_argument("--genfile", type=str,
-                       help='Output file when generating unconditionally')
+    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
+    group.add_argument("--greedy", action='store_true', default=False, help='Use greedy sampling.')
+    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
+    group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
+    group.add_argument(
+        "--out-seq-length", type=int, default=1024, help='Size of the output generated text.'
+    )
+    group.add_argument(
+        "--sample-input-file",
+        type=str,
+        default=None,
+        help='Get input from file instead of interactive mode, ' 'each line is an input.',
+    )
+    group.add_argument(
+        "--sample-output-file",
+        type=str,
+        default=None,
+        help='Output file got from --sample-input-file',
+    )
+    group.add_argument(
+        "--num-samples",
+        type=int,
+        default=0,
+        help='Number of samples to generate unconditionally, '
+        'defaults to 0 and interactive conditional sampling',
+    )
+    group.add_argument("--genfile", type=str, help='Output file when generating unconditionally')
     return parser
 
+
 def generate_samples_unconditional(model):
     args = get_args()
 
@@ -124,6 +135,7 @@ def generate_samples_unconditional(model):
         cnt = 0
         num_samples = args.num_samples
         from tqdm import tqdm
+
         pbar = tqdm(total=num_samples)
 
     while True:
@@ -131,16 +143,23 @@ def generate_samples_unconditional(model):
             sentences = [''] * args.global_batch_size
             print("global batch size", args.global_batch_size)
             max_len = args.out_seq_length
-            resp_sentences, resp_sentences_seg, output_logits, \
-            tokens = generate_and_post_process(model, prompts=sentences,
-                                               tokens_to_generate=max_len,
-                                               return_output_log_probs=False,
-                                               top_k_sampling=args.top_k,
-                                               top_p_sampling=args.top_p,
-                                               add_BOS=True,
-                                               temperature=1.0)
+            resp_sentences, resp_sentences_seg, output_logits, tokens = generate_and_post_process(
+                model,
+                prompts=sentences,
+                tokens_to_generate=max_len,
+                return_output_log_probs=False,
+                top_k_sampling=args.top_k,
+                top_p_sampling=args.top_p,
+                add_BOS=True,
+                temperature=1.0,
+            )
             for prompt, generation, token in zip(sentences, resp_sentences, tokens):
-                datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt}
+                datum = {
+                    'text': generation[len(prompt) :],
+                    'all_text': generation,
+                    'prompt': prompt,
+                    'id': cnt,
+                }
                 yield datum
                 cnt += 1
                 pbar.update()
@@ -161,6 +180,7 @@ def generate_samples_conditional(model):
         num_samples = args.num_samples
         cnt = 0
         from tqdm import tqdm
+
         pbar = tqdm(total=num_samples)
 
         fname = open(args.sample_input_file, "r")
@@ -184,16 +204,23 @@ def generate_samples_conditional(model):
                 sentences.append(raw_text)
 
             max_len = args.out_seq_length
-            resp_sentences, resp_sentences_seg, output_logits, \
-            tokens = generate_and_post_process(model, prompts=sentences,
-                                               tokens_to_generate=max_len,
-                                               return_output_log_probs=False,
-                                               top_k_sampling=args.top_k,
-                                               top_p_sampling=args.top_p,
-                                               add_BOS=False,
-                                               temperature=1.0)
+            resp_sentences, resp_sentences_seg, output_logits, tokens = generate_and_post_process(
+                model,
+                prompts=sentences,
+                tokens_to_generate=max_len,
+                return_output_log_probs=False,
+                top_k_sampling=args.top_k,
+                top_p_sampling=args.top_p,
+                add_BOS=False,
+                temperature=1.0,
+            )
             for prompt, generation, token in zip(sentences, resp_sentences, tokens):
-                datum = {'text': generation[len(prompt):], 'all_text': generation, 'prompt': prompt, 'id': cnt}
+                datum = {
+                    'text': generation[len(prompt) :],
+                    'all_text': generation,
+                    'prompt': prompt,
+                    'id': cnt,
+                }
                 yield datum
                 cnt += 1
                 pbar.update()
@@ -220,8 +247,7 @@ def generate_and_write_samples_conditional(model):
     args = get_args()
     if args.sample_output_file is None:
         sample_output_file = args.sample_input_file + ".out"
-        print('`sample-output-file` not specified, setting '
-              'it to {}'.format(sample_output_file))
+        print('`sample-output-file` not specified, setting ' 'it to {}'.format(sample_output_file))
     else:
         sample_output_file = args.sample_output_file
     with open(sample_output_file, 'w') as f:
@@ -233,11 +259,15 @@ def generate_and_write_samples_conditional(model):
 def main():
     """Main program."""
 
-    parse_and_validate_args(extra_args_provider=add_text_generate_args,
-                            args_defaults={'tokenizer_type': 'GPT2BPETokenizer',
-                                           'no_load_rng': True,
-                                           'no_load_optim': True,
-                                           'seq_length': 2048})
+    parse_and_validate_args(
+        extra_args_provider=add_text_generate_args,
+        args_defaults={
+            'tokenizer_type': 'GPT2BPETokenizer',
+            'no_load_rng': True,
+            'no_load_optim': True,
+            'seq_length': 2048,
+        },
+    )
     initialize_megatron()
 
     # Set up model and load checkpoint
diff --git a/examples/inference/gpt/gpt_dynamic_inference.py b/examples/inference/gpt/gpt_dynamic_inference.py
index 02a257c1b46..e5df38fe856 100644
--- a/examples/inference/gpt/gpt_dynamic_inference.py
+++ b/examples/inference/gpt/gpt_dynamic_inference.py
@@ -11,10 +11,11 @@
 from collections import defaultdict
 from typing import Dict, List, Optional
 
-from megatron.training.arguments import parse_and_validate_args
 import torch
 from tqdm import tqdm
 
+from megatron.training.arguments import parse_and_validate_args
+
 sys.path.append(
     os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
 )
@@ -241,7 +242,10 @@ def _process_step_result(result):
 
                 # Suspend.
                 if attempted_step_count % args.suspend_resume_interval == 0:
-                    print("**** step %d/%d ... suspend." % (engine.context.step_count, attempted_step_count))
+                    print(
+                        "**** step %d/%d ... suspend."
+                        % (engine.context.step_count, attempted_step_count)
+                    )
                     engine.suspend()
 
                 # Resume, 0+ attempted steps later.
@@ -251,7 +255,10 @@ def _process_step_result(result):
                     % args.suspend_resume_interval
                     == 0
                 ):
-                    print("**** step %d/%d ... resume." % (engine.context.step_count, attempted_step_count))
+                    print(
+                        "**** step %d/%d ... resume."
+                        % (engine.context.step_count, attempted_step_count)
+                    )
                     engine.resume()
 
             # If engine suspended, continue to next iter.
@@ -469,7 +476,9 @@ def escape_str(s):
             # Attach peak memory metrics; the functional test only validates these
             # if the fields exist in the golden values.
             json_results.update(peak_mem_stats)
-            json_results["lifetime_prefill_token_count"] = engine.context.lifetime_prefill_token_count
+            json_results["lifetime_prefill_token_count"] = (
+                engine.context.lifetime_prefill_token_count
+            )
 
             print(f' Saving results to {args.output_path}')
             with open(args.output_path, "w") as fp:
diff --git a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
index 247404d537e..31c2b3529de 100644
--- a/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
+++ b/examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py
@@ -9,7 +9,6 @@
 from collections import defaultdict
 from typing import List
 
-from megatron.training.arguments import parse_and_validate_args
 import torch
 import torch.distributed as dist
 
@@ -25,6 +24,7 @@
     get_model_for_inference,
 )
 from megatron.training import get_args, get_tokenizer, initialize_megatron
+from megatron.training.arguments import parse_and_validate_args
 
 # pylint: disable=line-too-long
 
@@ -73,11 +73,15 @@ async def main(
     )
 
     # All ranks agree on the number of suspend/resume cycles from args.
-    num_suspend_resume_cycles = len(requests) // args.suspend_resume_interval if args.suspend_resume_interval else 0
+    num_suspend_resume_cycles = (
+        len(requests) // args.suspend_resume_interval if args.suspend_resume_interval else 0
+    )
 
     # Create client and run example.
     if dist.get_rank() == 0:
-        client = InferenceClient(dp_addr, deserialize=True)  # submits requests to the inference coordinator
+        client = InferenceClient(
+            dp_addr, deserialize=True
+        )  # submits requests to the inference coordinator
         client.start()
         base_arrival_time = time.time_ns() / 10**9
         for request in requests:
@@ -103,7 +107,10 @@ async def main(
                     futures.append(client.add_request(request.prompt_text, request.sampling_params))
                     num_requests_added += 1
 
-                    if num_requests_added >= next_suspend_at and cycles_done < num_suspend_resume_cycles:
+                    if (
+                        num_requests_added >= next_suspend_at
+                        and cycles_done < num_suspend_resume_cycles
+                    ):
                         await suspend_resume_cycle(client, engine, args, futures)
                         cycles_done += 1
                         next_suspend_at += args.suspend_resume_interval
@@ -120,7 +127,10 @@ async def main(
                     futures.append(client.add_request(request.prompt_text, request.sampling_params))
                     num_requests_added += 1
 
-                    if num_requests_added >= next_suspend_at and cycles_done < num_suspend_resume_cycles:
+                    if (
+                        num_requests_added >= next_suspend_at
+                        and cycles_done < num_suspend_resume_cycles
+                    ):
                         await suspend_resume_cycle(client, engine, args, futures)
                         cycles_done += 1
                         next_suspend_at += args.suspend_resume_interval
@@ -159,7 +169,7 @@ async def main(
                 throughputs.append(throughput)
                 if req.routing_indices is not None:
                     result_dict["routing_indices"] = req.routing_indices.tolist()
-                                
+
                 json_results[req.request_id] = result_dict
             throughput_dict = {"throughput": throughputs}
             if args.throughput_check_only:
diff --git a/examples/inference/gpt/gpt_static_inference.py b/examples/inference/gpt/gpt_static_inference.py
index d3dd619eaa1..9c748fdf795 100644
--- a/examples/inference/gpt/gpt_static_inference.py
+++ b/examples/inference/gpt/gpt_static_inference.py
@@ -5,7 +5,6 @@
 import time
 from argparse import Namespace
 
-from megatron.training.arguments import parse_and_validate_args
 import torch
 
 from megatron.core.inference.contexts import StaticInferenceContext
@@ -20,6 +19,7 @@
 )
 from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
 from megatron.core.transformer.module import MegatronModule
+from megatron.training.arguments import parse_and_validate_args
 
 sys.path.append(
     os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
diff --git a/examples/mimo/train.py b/examples/mimo/train.py
index a89c83728e0..05eb4f2ab0c 100644
--- a/examples/mimo/train.py
+++ b/examples/mimo/train.py
@@ -9,26 +9,24 @@
 from functools import partial
 from typing import Any, Dict, Iterator
 
-from megatron.training.arguments import parse_and_validate_args
 import torch
-from megatron.training import get_args, pretrain, print_rank_0
 
 from megatron.core.parallel_state import (
+    get_context_parallel_group,
+    get_data_parallel_group,
     get_tensor_model_parallel_group,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_src_rank,
-    get_context_parallel_group,
-    get_data_parallel_group,
 )
+from megatron.training import get_args, pretrain, print_rank_0
+from megatron.training.arguments import parse_and_validate_args
 
 sys.path.append(
     os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
 )
 from data.energon_avlm_task_encoder import llava_avlm_dataloader_provider
 from data.energon_vlm_task_encoder import llava_vlm_dataloader_provider
-from data.mock import (
-    train_valid_test_datasets_provider as mock_train_valid_test_datasets_provider,
-)
+from data.mock import train_valid_test_datasets_provider as mock_train_valid_test_datasets_provider
 from model_providers.llava_avlm import model_provider_llava_avlm
 from model_providers.llava_vlm import model_provider_llava_vlm
 from model_providers.mock import model_provider_mock_vlm_single_encoder
@@ -50,13 +48,24 @@
     "llava_avlm": llava_avlm_dataloader_provider,
 }
 
+
 def add_mimo_args(parser):
     """Add MIMO-specific arguments to the parser."""
     group = parser.add_argument_group('MIMO', 'MIMO specific arguments')
 
     # MIMO-specific parameters
-    group.add_argument('--dataset-provider', type=str, default='mock', help='Dataset provider to choose from [mock, llava_vlm, video_llava_vlm, llava_avlm]')
-    group.add_argument('--model-provider', type=str, default='mock', help='Model provider to choose from [mock, llava_vlm, video_llava_vlm, llava_avlm]')
+    group.add_argument(
+        '--dataset-provider',
+        type=str,
+        default='mock',
+        help='Dataset provider to choose from [mock, llava_vlm, video_llava_vlm, llava_avlm]',
+    )
+    group.add_argument(
+        '--model-provider',
+        type=str,
+        default='mock',
+        help='Model provider to choose from [mock, llava_vlm, video_llava_vlm, llava_avlm]',
+    )
 
     # mock dataloader related args
     # can control mock samples with total seq length and image seq length
@@ -71,17 +80,29 @@ def add_mimo_args(parser):
         '--audio-encoder-model', type=str, default=None, help='Audio encoder model name'
     )
     group.add_argument(
-        '--hf-assign-unused-tokens', type=str, nargs='+', default=None,
-                       help='Assigning unused tokens to special tokens. Example: '
-                       '--hf-assign-unused-tokens "<audio>,32002" "<video>,32003"'
+        '--hf-assign-unused-tokens',
+        type=str,
+        nargs='+',
+        default=None,
+        help='Assigning unused tokens to special tokens. Example: '
+        '--hf-assign-unused-tokens "<audio>,32002" "<video>,32003"',
     )
     # checkpoint related args
-    group.add_argument('--language-model-checkpoint', type=str, default=None, help='Path to language model checkpoint to load')
+    group.add_argument(
+        '--language-model-checkpoint',
+        type=str,
+        default=None,
+        help='Path to language model checkpoint to load',
+    )
     # energon dataloader related args
-    group.add_argument('--packing-buffer-size', type=int, default=None, help='Packing buffer size when using sequence packing')
-    
-    return parser
+    group.add_argument(
+        '--packing-buffer-size',
+        type=int,
+        default=None,
+        help='Packing buffer size when using sequence packing',
+    )
 
+    return parser
 
 
 def get_batch(data_iterator: Iterator[Dict[str, Any]]):
@@ -96,9 +117,10 @@ def get_batch(data_iterator: Iterator[Dict[str, Any]]):
     args = get_args()
 
     # Assert that pipeline parallelism are not supported yet
-    assert (getattr(args, 'pipeline_model_parallel_size', 1) == 1), \
-        "Pipeline parallelism is not supported yet in MIMO implementation"
-    
+    assert (
+        getattr(args, 'pipeline_model_parallel_size', 1) == 1
+    ), "Pipeline parallelism is not supported yet in MIMO implementation"
+
     # Broadcast data - only get data on tensor parallel rank 0
     # data iterator is None on other tp ranks
     # TP Rank-0 reads next batch.
@@ -121,7 +143,7 @@ def get_batch(data_iterator: Iterator[Dict[str, Any]]):
         # we need this to avoid race condition when first tp rank hits StopIteration
         return None
 
-    # MiMo forward pass expects 
+    # MiMo forward pass expects
     # input_ids: torch.Tensor,
     # position_ids: Optional[torch.Tensor] = None,
     # attention_mask: Optional[torch.Tensor] = None,
@@ -137,6 +159,7 @@ def get_batch(data_iterator: Iterator[Dict[str, Any]]):
 
     return batch
 
+
 def loss_func(loss_mask, output_tensor):
     """Simple loss function for MIMO model training.
 
@@ -151,20 +174,18 @@ def loss_func(loss_mask, output_tensor):
 
     loss_mask = loss_mask.contiguous().view(-1).float()
 
-    
-
     total_tokens = loss_mask.sum().clone().detach().to(torch.int)
     total_loss = torch.sum(losses.view(-1) * loss_mask)
 
     loss = torch.cat([total_loss.view(1), total_tokens.view(1)])
 
     loss_for_backward = loss[0].clone()
-    # If CP is active, reduce the loss across all CP ranks 
+    # If CP is active, reduce the loss across all CP ranks
     # as they have loss calculated for their own sequence shards.
     if args.context_parallel_size > 1:
         torch.distributed.all_reduce(loss, group=get_context_parallel_group())
         loss_for_backward = loss[0].clone()
-    # For reporting, clone and detach the loss. This creates a new tensor 
+    # For reporting, clone and detach the loss. This creates a new tensor
     # that doesn't require gradients and is independent of the computation graph.
     reporting_loss = loss.clone().detach()
     torch.distributed.all_reduce(reporting_loss, group=get_data_parallel_group())
@@ -186,7 +207,7 @@ def forward_step(data_iterator, model):
     """
     data_batch = get_batch(data_iterator)
     output_tensor, loss_mask = model(**data_batch)
-    
+
     # Return output and loss function
     return output_tensor, partial(loss_func, loss_mask)
 
@@ -204,8 +225,10 @@ def train_valid_test_datasets_provider(*provider_args, **provider_kwargs):
         if runtime_args.dataset_provider != "mock":
             # Calculate max_seq_length from total_seq_length
             max_seq_length = runtime_args.total_seq_length
-            print_rank_0(f"MIMO Training: Using max_seq_length = {max_seq_length} "
-                f"(total_seq_length: {runtime_args.total_seq_length})")
+            print_rank_0(
+                f"MIMO Training: Using max_seq_length = {max_seq_length} "
+                f"(total_seq_length: {runtime_args.total_seq_length})"
+            )
 
             # Add configs to provider_kwargs
             provider_kwargs['max_seq_length'] = max_seq_length
@@ -217,6 +240,7 @@ def train_valid_test_datasets_provider(*provider_args, **provider_kwargs):
 
     return dataset_provider(*provider_args, **provider_kwargs)
 
+
 def model_provider(
     pre_process: bool = True,
     post_process: bool = True,
@@ -263,18 +287,15 @@ def model_provider(
             "pg_collection": pg_collection,
         }
     else:
-        raise ValueError(f"Unknown model provider: {runtime_args.model_provider}. Must be one of ['llava_vlm', 'llava_avlm', 'mock]")
-
-    return builder_fn(
-        pre_process,
-        post_process,
-        add_encoder,
-        add_decoder,
-        **builder_kwargs,
-    )
+        raise ValueError(
+            f"Unknown model provider: {runtime_args.model_provider}. Must be one of ['llava_vlm', 'llava_avlm', 'mock]"
+        )
+
+    return builder_fn(pre_process, post_process, add_encoder, add_decoder, **builder_kwargs)
+
 
 if __name__ == "__main__":
-    
+
     train_valid_test_datasets_provider.is_distributed = True
     parse_and_validate_args(args_defaults={}, extra_args_provider=add_mimo_args)
     pretrain(
diff --git a/examples/multimodal/run_text_generation.py b/examples/multimodal/run_text_generation.py
index 532d0771a94..10f3add18e7 100644
--- a/examples/multimodal/run_text_generation.py
+++ b/examples/multimodal/run_text_generation.py
@@ -5,7 +5,7 @@
 import os
 import sys
 from functools import partial
-from typing import List, Dict
+from typing import Dict, List
 
 # Add megatron to the path.
 sys.path.append(
@@ -21,24 +21,24 @@
 
 from megatron.core import parallel_state
 from megatron.core.enums import ModelType
-from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
-from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
-from megatron.inference.text_generation.api import generate_and_post_process
-from megatron.inference.text_generation.forward_step import ForwardStep
 from megatron.core.inference.contexts import StaticInferenceContext
-from megatron.core.inference.sampling_params import SamplingParams
 from megatron.core.inference.engines import StaticInferenceEngine
 from megatron.core.inference.inference_request import InferenceRequest, VLMInferenceRequest
-from megatron.core.inference.text_generation_controllers.vlm_text_generation_controller import (
-    VLMTextGenerationController,
-)
 from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
     InferenceWrapperConfig,
 )
 from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import (
     VLMInferenceWrapper,
 )
-from megatron.training import get_args, get_model, get_tokenizer, print_rank_0, is_last_rank
+from megatron.core.inference.sampling_params import SamplingParams
+from megatron.core.inference.text_generation_controllers.vlm_text_generation_controller import (
+    VLMTextGenerationController,
+)
+from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
+from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
+from megatron.inference.text_generation.api import generate_and_post_process
+from megatron.inference.text_generation.forward_step import ForwardStep
+from megatron.training import get_args, get_model, get_tokenizer, is_last_rank, print_rank_0
 from megatron.training.arguments import parse_and_validate_args
 from megatron.training.checkpointing import load_checkpoint
 from megatron.training.initialize import initialize_megatron
@@ -120,7 +120,7 @@ def get_evaluation_dataloader(
     num_frames,
     num_workers,
     vision_model_type,
-    split="validation"
+    split="validation",
 ):
     """Build evaluation dataset."""
     dataset = get_evaluation_dataset(
@@ -137,7 +137,7 @@ def get_evaluation_dataloader(
         partition_id,
         num_frames,
         vision_model_type,
-        split=split
+        split=split,
     )
 
     dp_rank = parallel_state.get_data_parallel_rank()
@@ -173,7 +173,7 @@ def generate_samples(model, config: EvaluationConfig, print_output):
         args.num_frames,
         args.num_workers,
         args.vision_model_type,
-        config.split
+        config.split,
     )
 
     num_img_embeddings_per_tile = get_num_image_embeddings(
@@ -219,21 +219,29 @@ def generate_samples(model, config: EvaluationConfig, print_output):
         conv = get_conversation(config.task, question, metadata)
 
         if not args.use_mcore_inference:
-            forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles, args.decoder_seq_length)
+            forward_step = partial(
+                VLMForwardStep,
+                num_img_embeddings_per_tile,
+                imgs,
+                num_tiles,
+                args.decoder_seq_length,
+            )
 
-        inference_context = StaticInferenceContext(max_batch_size=1, max_sequence_length=args.inference_max_seq_length)
+        inference_context = StaticInferenceContext(
+            max_batch_size=1, max_sequence_length=args.inference_max_seq_length
+        )
         if is_first_rank():
 
             if args.use_mcore_inference:
                 inference_request = VLMInferenceRequest(
-                   request_id=inference_engine.get_new_request_id(),
-                   prompt=conv,
-                   prompt_tokens=controller.tokenize_prompt(controller.tokenizer, conv),
-                   sampling_params=sampling_params,
-                   num_img_embeddings_per_tile=num_img_embeddings_per_tile,
-                   imgs=imgs,
-                   num_tiles=num_tiles,
-                   decoder_seq_length=args.decoder_seq_length,
+                    request_id=inference_engine.get_new_request_id(),
+                    prompt=conv,
+                    prompt_tokens=controller.tokenize_prompt(controller.tokenizer, conv),
+                    sampling_params=sampling_params,
+                    num_img_embeddings_per_tile=num_img_embeddings_per_tile,
+                    imgs=imgs,
+                    num_tiles=num_tiles,
+                    decoder_seq_length=args.decoder_seq_length,
                 )
                 results: List[InferenceRequest] = inference_engine.generate(
                     inference_requests=[inference_request]
@@ -245,7 +253,8 @@ def generate_samples(model, config: EvaluationConfig, print_output):
                 ]
             else:
                 resp_sentences, _, _, _ = generate_and_post_process(
-                    model, inference_context,
+                    model,
+                    inference_context,
                     forward_step=forward_step,
                     prompts=[conv],
                     tokens_to_generate=config.out_seq_length,
@@ -256,7 +265,7 @@ def generate_samples(model, config: EvaluationConfig, print_output):
                     random_seed=args.seed,
                     detokenize_segments=False,
                     data_parallel=True,
-            )
+                )
 
             for generation in resp_sentences:
                 if isinstance(sample_id, torch.Tensor):
@@ -343,21 +352,23 @@ def generate_samples(model, config: EvaluationConfig, print_output):
         else:
             if args.use_mcore_inference:
                 inference_request = VLMInferenceRequest(
-                   request_id=inference_engine.get_new_request_id(),
-                   prompt=conv,
-                   prompt_tokens=controller.tokenize_prompt(controller.tokenizer, conv),
-                   sampling_params=sampling_params,
-                   num_img_embeddings_per_tile=num_img_embeddings_per_tile,
-                   imgs=imgs,
-                   num_tiles=num_tiles,
-                   decoder_seq_length=args.decoder_seq_length,
-                )
-                inference_engine.generate(
-                    inference_requests=[inference_request]
+                    request_id=inference_engine.get_new_request_id(),
+                    prompt=conv,
+                    prompt_tokens=controller.tokenize_prompt(controller.tokenizer, conv),
+                    sampling_params=sampling_params,
+                    num_img_embeddings_per_tile=num_img_embeddings_per_tile,
+                    imgs=imgs,
+                    num_tiles=num_tiles,
+                    decoder_seq_length=args.decoder_seq_length,
                 )
+                inference_engine.generate(inference_requests=[inference_request])
             else:
                 generate_and_post_process(
-                    model, inference_context, forward_step=forward_step, detokenize_segments=False, data_parallel=True
+                    model,
+                    inference_context,
+                    forward_step=forward_step,
+                    detokenize_segments=False,
+                    data_parallel=True,
                 )
 
             idx += 1
@@ -455,6 +466,7 @@ def generate_and_write_samples(model, config, print_output=True):
     if is_first_rank():
         output_file.close()
 
+
 class VLMForwardStep(ForwardStep):
     """Inference forward step for a multimodal model."""
 
@@ -502,14 +514,19 @@ def __call__(self, tokens, position_ids, attention_mask):
             if self._recv_only_vision_embeds:
                 recv_buffer_seq_length = self._num_img_embeddings
             else:
-                recv_buffer_seq_length = min(self._num_img_embeddings + num_tokens - num_image_tokens, self.decoder_seq_length)
+                recv_buffer_seq_length = min(
+                    self._num_img_embeddings + num_tokens - num_image_tokens,
+                    self.decoder_seq_length,
+                )
         elif self._recv_only_vision_embeds:
             # If this stage only receives vision embeddings and there are no image tokens we won't run the encoder and therefore shouldn't try to recv.
             recv_buffer_seq_length = 0
 
         # If the pipeline stage only has a vision encoder, then it only needs to run when there are image tokens
         if not (self._encoder_only and num_image_tokens == 0):
-            output = super().__call__(tokens, position_ids, attention_mask, recv_buffer_seq_length=recv_buffer_seq_length)
+            output = super().__call__(
+                tokens, position_ids, attention_mask, recv_buffer_seq_length=recv_buffer_seq_length
+            )
         else:
             output = None
         if isinstance(output, tuple):
@@ -522,13 +539,16 @@ def __call__(self, tokens, position_ids, attention_mask):
         # update the sequence length offset by the number of image tokens.
         if num_tokens > 1 and num_image_tokens > 0:
             if "image_tokens_count" not in self.inference_context.key_value_memory_dict:
-                self.inference_context.key_value_memory_dict["image_tokens_count"] = self._num_img_embeddings
+                self.inference_context.key_value_memory_dict["image_tokens_count"] = (
+                    self._num_img_embeddings
+                )
 
             if self._num_img_embeddings + num_tokens - num_image_tokens > self.decoder_seq_length:
                 self.inference_context.sequence_len_offset += self.decoder_seq_length - num_tokens
             else:
                 self.inference_context.sequence_len_offset += (
-                    self.inference_context.key_value_memory_dict["image_tokens_count"] - num_image_tokens
+                    self.inference_context.key_value_memory_dict["image_tokens_count"]
+                    - num_image_tokens
                 )
 
         return logits
@@ -599,7 +619,10 @@ def get_conversation(task, question, metadata=None):
         ]
     elif task == "MathVista":
         conversation = [
-            {"role": "system", "content": "You are math expert. Use your math knowledge to calculate the answer."},
+            {
+                "role": "system",
+                "content": "You are math expert. Use your math knowledge to calculate the answer.",
+            },
             {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
         ]
     elif task == "RealworldQA":
@@ -613,13 +636,17 @@ def get_conversation(task, question, metadata=None):
             {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
         ]
     elif task == "MotionBench":
-        extra_instruction = "Respond with only the letter choice (A, B, C, or D) of the correct option.\n"
+        extra_instruction = (
+            "Respond with only the letter choice (A, B, C, or D) of the correct option.\n"
+        )
         conversation = [
             {"role": "system", "content": "Answer the questions."},
             {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}\n{extra_instruction}"},
         ]
     elif task == "PhysGameBench":
-        extra_instruction = "Respond with only the letter choice (A, B, C, or D) of the correct option.\n"
+        extra_instruction = (
+            "Respond with only the letter choice (A, B, C, or D) of the correct option.\n"
+        )
         conversation = [
             {"role": "system", "content": "Answer the questions."},
             {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}\n{extra_instruction}"},
@@ -627,7 +654,10 @@ def get_conversation(task, question, metadata=None):
     elif task == "MVBench":
         conversation = [
             {"role": "system", "content": "Answer the questions."},
-            {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase."},
+            {
+                "role": "user",
+                "content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.",
+            },
         ]
     elif task in ["PerceptionTest"]:
         conversation = [
@@ -642,7 +672,6 @@ def get_conversation(task, question, metadata=None):
     else:
         raise NotImplementedError(f"No prompting support for task {task}")
 
-
     return conversation
 
 
@@ -694,64 +723,87 @@ def run_eval(config, iteration=None):
 
     if config.task == "TextVQA":
         from evaluation.evaluate_textvqa import textvqa_eval
+
         avg_acc = textvqa_eval(config.output_path)
 
         score = {"TextVQA accuracy": avg_acc}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} at iteration={iteration} TextVQA accuracy: {score}\n")
+            f.write(
+                f"{config.task} {config.dataset} at iteration={iteration} TextVQA accuracy: {score}\n"
+            )
 
     elif config.task == "OCRBench":
         from evaluation.evaluate_ocrbench import ocrbench_eval
+
         log, avg_acc = ocrbench_eval(config.output_path)
 
         score = {"OCRBench accuracy": avg_acc}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} at iteration={iteration} OCRBench accuracy: {score}\n")
+            f.write(
+                f"{config.task} {config.dataset} at iteration={iteration} OCRBench accuracy: {score}\n"
+            )
             f.write(f"{log}\n")
 
     elif config.task == "MathVista":
         from evaluation.evaluate_mathvista import mathvista_eval
+
         avg_acc = mathvista_eval(config.output_path)
 
         score = {"MathVista accuracy": avg_acc}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} at iteration={iteration} MathVista accuracy: {score}\n")
+            f.write(
+                f"{config.task} {config.dataset} at iteration={iteration} MathVista accuracy: {score}\n"
+            )
 
     elif config.task == "ChartQA":
         from evaluation.evaluate_chartqa import chartqa_eval
+
         avg_acc = chartqa_eval(config.output_path)
 
         score = {"ChartQA accuracy": avg_acc}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} at iteration={iteration} ChartQA accuracy: {score}\n")
+            f.write(
+                f"{config.task} {config.dataset} at iteration={iteration} ChartQA accuracy: {score}\n"
+            )
 
     elif config.task == "SPDocVQA":
         from evaluation.evaluate_spdocvqa import spdocvqa_eval
+
         avg_acc = spdocvqa_eval(config.output_path)
 
         score = {"SPDocVQA accuracy": avg_acc}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} at iteration={iteration} SPDocVQA accuracy: {score}\n")
+            f.write(
+                f"{config.task} {config.dataset} at iteration={iteration} SPDocVQA accuracy: {score}\n"
+            )
 
     elif config.task == "RealworldQA":
         from evaluation.evaluate_realworldqa import realworldqa_eval
+
         avg_acc = realworldqa_eval(config.output_path)
 
         score = {"RealworldQA accuracy": avg_acc}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} at iteration={iteration} RealworldQA accuracy: {score}\n")
+            f.write(
+                f"{config.task} {config.dataset} at iteration={iteration} RealworldQA accuracy: {score}\n"
+            )
 
     elif config.task == "AI2D":
         from evaluation.evaluate_ai2d import ai2d_eval
+
         avg_acc = ai2d_eval(config.output_path)
 
         score = {f"AI2D {config.dataset} accuracy": avg_acc}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} at iteration={iteration} AI2D accuracy: {score}\n")
+            f.write(
+                f"{config.task} {config.dataset} at iteration={iteration} AI2D accuracy: {score}\n"
+            )
 
     elif config.task == "MMMU":
         from evaluation.evaluate_mmmu import convert_to_mmmu_format
+
         from examples.multimodal.evaluation.mmmu_utils import mmmu_main_eval
+
         result_file = convert_to_mmmu_format(config.output_path)
         result = json.load(open(result_file))
         mmmu_results = mmmu_main_eval(result, {"answer_dict": config.gt_path})
@@ -766,13 +818,17 @@ def run_eval(config, iteration=None):
         score = {"MMMU val accuracy": mmmu_results['Overall']['acc']}
     elif config.task == 'captioning':
         from evaluation.evaluate_coco import coco_captioning_eval
+
         cider_score = coco_captioning_eval(config.output_path, config.gt_path)
         score = {f"{config.task} {config.dataset} CIDEr": cider_score}
 
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} CIDEr scores at iteration={iteration}: {cider_score}\n")
+            f.write(
+                f"{config.task} {config.dataset} CIDEr scores at iteration={iteration}: {cider_score}\n"
+            )
     elif config.task == 'MotionBench':
         from evaluation.evaluate_video_motionbench import motionbench_eval
+
         avg_acc = motionbench_eval(config.output_path)
 
         score = {f"MotionBench accuracy": avg_acc}
@@ -780,18 +836,24 @@ def run_eval(config, iteration=None):
             f.write(f"{config.task} {config.dataset} scores at iteration={iteration}: {score}\n")
     elif config.task == 'PhysGameBench':
         from evaluation.evaluate_video_phys_game_bench import phys_game_bench_eval
+
         avg_acc_dict = phys_game_bench_eval(config.output_path)
 
         score = {f"PhysGame Total accuracy": avg_acc_dict['Physgame-Total-Acc']}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} scores at iteration={iteration}: {avg_acc_dict}\n")
+            f.write(
+                f"{config.task} {config.dataset} scores at iteration={iteration}: {avg_acc_dict}\n"
+            )
     elif config.task == "MVBench":
         from evaluation.evaluate_video_mvbench import mvbench_eval
+
         avg_acc_dict = mvbench_eval(config.output_path)
 
         score = {f"MVBench accuracy": avg_acc_dict['total-acc']}
         with open(config.output_path + "-scores.txt", "a") as f:
-            f.write(f"{config.task} {config.dataset} scores at iteration={iteration}: {avg_acc_dict}\n")
+            f.write(
+                f"{config.task} {config.dataset} scores at iteration={iteration}: {avg_acc_dict}\n"
+            )
     elif config.task == "inference":
         score = {"Inference accuracy:": None}
         pass
@@ -802,7 +864,9 @@ def run_eval(config, iteration=None):
     return score
 
 
-def run_evaluation_loop(model, configs, output_dir_override=None, iteration=None, print_output=True):
+def run_evaluation_loop(
+    model, configs, output_dir_override=None, iteration=None, print_output=True
+):
     """
     Common evaluation loop used by both online evaluation during training and standalone evaluation.
 
@@ -849,11 +913,18 @@ def eval_tasks():
     args = get_args()
 
     def wrapped_model_provider(pre_process, post_process, add_encoder=True, add_decoder=True):
-        return model_provider(pre_process, post_process, add_encoder=add_encoder, add_decoder=add_decoder,
-                              parallel_output=False)
+        return model_provider(
+            pre_process,
+            post_process,
+            add_encoder=add_encoder,
+            add_decoder=add_decoder,
+            parallel_output=False,
+        )
 
     # Set up model and load checkpoint.
-    model = get_model(wrapped_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=False)
+    model = get_model(
+        wrapped_model_provider, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=False
+    )
 
     if args.load is not None:
         _ = load_checkpoint(model, None, None)
diff --git a/examples/multimodal/train.py b/examples/multimodal/train.py
index 01505d60e57..98536f72d1e 100644
--- a/examples/multimodal/train.py
+++ b/examples/multimodal/train.py
@@ -12,23 +12,23 @@
     os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
 )
 
-from dataloader_provider import train_valid_test_dataloaders_provider, is_first_or_last_stage
+from dataloader_provider import is_first_or_last_stage, train_valid_test_dataloaders_provider
 from model import model_provider
 from multimodal_args import add_multimodal_extra_args
 
 from megatron.core import mpu, tensor_parallel
-from megatron.core.utils import nvtx_range_pop, nvtx_range_push
 from megatron.core.enums import ModelType
 from megatron.core.models.multimodal import context_parallel
 from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, LLaVAModel
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.parallel_state import (
-    get_tensor_model_parallel_rank,
     get_pipeline_model_parallel_world_size,
+    get_tensor_model_parallel_rank,
     is_pipeline_last_stage,
 )
+from megatron.core.utils import nvtx_range_pop, nvtx_range_push
 from megatron.training import get_args, get_timers, get_tokenizer, pretrain
-from megatron.training.utils import is_last_rank, get_batch_on_this_cp_rank
+from megatron.training.utils import get_batch_on_this_cp_rank, is_last_rank
 
 
 def get_batch(data_iterator, image_token_index, img_seq_len):
@@ -51,7 +51,16 @@ def get_batch(data_iterator, image_token_index, img_seq_len):
     pp_size = get_pipeline_model_parallel_world_size()
     if not is_first_or_last_stage(pp_size):
         # Note these are all set to None above.
-        return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles, packed_seq_params
+        return (
+            tokens,
+            labels,
+            loss_mask,
+            attention_mask,
+            position_ids,
+            imgs,
+            num_tiles,
+            packed_seq_params,
+        )
 
     # Broadcast data.
     nvtx_range_push("get_data")
@@ -74,7 +83,9 @@ def get_batch(data_iterator, image_token_index, img_seq_len):
         # FSDP can hang with text-only samples. A workaround is to run a valid dummy image through the vision
         # model and then add image embeddings with a zero multiplier.
         if args.use_torch_fsdp2:
-            imgs = torch.zeros((1, 3, args.img_h, args.img_w), dtype=torch.float32, device=data_text.device)
+            imgs = torch.zeros(
+                (1, 3, args.img_h, args.img_w), dtype=torch.float32, device=data_text.device
+            )
             num_tiles = torch.tensor([], dtype=torch.int, device=data_text.device)
         else:
             # Similar workaround is not needed without FSDP and we can use an empty image.
@@ -130,13 +141,20 @@ def get_batch(data_iterator, image_token_index, img_seq_len):
 
         # CP expects sequence length is divisible by CP size so apply padding.
         mp_padding_needed = context_parallel.get_padding(
-            seq_len, args.context_parallel_size,
-            args.tensor_model_parallel_size, args.sequence_parallel,
+            seq_len,
+            args.context_parallel_size,
+            args.tensor_model_parallel_size,
+            args.sequence_parallel,
         )
-        tokens, position_ids, labels, loss_mask = [torch.nn.functional.pad(item, (0, mp_padding_needed)) for item in (tokens, position_ids, labels, loss_mask)]
+        tokens, position_ids, labels, loss_mask = [
+            torch.nn.functional.pad(item, (0, mp_padding_needed))
+            for item in (tokens, position_ids, labels, loss_mask)
+        ]
 
         # Get PackedSeqParams that indicate the amount of padding for TransformerEngine.
-        packed_seq_params = context_parallel.get_packed_seq_params(tokens, num_image_embeddings, mp_padding_needed, args.context_parallel_size, True)
+        packed_seq_params = context_parallel.get_packed_seq_params(
+            tokens, num_image_embeddings, mp_padding_needed, args.context_parallel_size, True
+        )
 
     return (
         tokens,
@@ -175,14 +193,15 @@ def get_mask_start_and_end_idx(arr):
     get_mask_start_and_end_idx(arr) = [(1, 1), (4, 5)]
     such that arr[1:1+1] = [1] and arr[4:5+1] = [1, 1]
     """
-    mask = (arr != 0)
+    mask = arr != 0
 
     mask_int = mask.int()
 
     diff = mask_int[1:] - mask_int[:-1]
     start_indices = (diff == 1).nonzero(as_tuple=False).flatten() + 1
     end_indices = (diff == -1).nonzero(as_tuple=False).flatten()
-    if len(mask)==0: return []
+    if len(mask) == 0:
+        return []
     if mask[0]:
         start_indices = torch.cat((torch.tensor([0], device=arr.device), start_indices))
     if mask[-1]:
@@ -211,8 +230,8 @@ def scaled_loss_func(loss_mask, output_tensor):
         turn_start_end_list = get_mask_start_and_end_idx(loss_mask[idx])
         for turn_start, turn_end in turn_start_end_list:
             # compute loss for each turn
-            loss_this_turn = loss_this_sample[turn_start:turn_end+1].sum()
-            assert (1 - loss_mask)[idx][turn_start:turn_end+1].sum() < 1.0
+            loss_this_turn = loss_this_sample[turn_start : turn_end + 1].sum()
+            assert (1 - loss_mask)[idx][turn_start : turn_end + 1].sum() < 1.0
             num_valid_labels_this_turn = turn_end - turn_start + 1
             loss_this_turn = loss_this_turn / num_valid_labels_this_turn
             loss_list.append(loss_this_turn)
@@ -231,7 +250,9 @@ def scaled_loss_func(loss_mask, output_tensor):
         total_tokens = loss_mask.sum()
         total_loss = torch.sum(losses.view(-1) * loss_mask)
     else:
-        raise RuntimeError("loss_list for loss scaling per conversation unexpectedly got empty list")
+        raise RuntimeError(
+            "loss_list for loss scaling per conversation unexpectedly got empty list"
+        )
 
     num_tokens = total_tokens.clone().detach().to(torch.int)
     reporting_loss = torch.cat([total_loss.clone().detach().view(1), num_tokens.view(1)])
@@ -276,7 +297,9 @@ def forward_step(data_iterator, model: LLaVAModel):
         images,
         num_image_tiles,
         packed_seq_params,
-    ) = get_batch(data_iterator, model.module.module.image_token_index, model.module.module.img_seq_len)
+    ) = get_batch(
+        data_iterator, model.module.module.image_token_index, model.module.module.img_seq_len
+    )
     timers('batch-generator').stop()
 
     output_tensor, loss_mask = model(
@@ -333,6 +356,7 @@ def run_online_eval(model):
         return []
 
     from config import EvaluationConfig
+
     # Import the common evaluation functions
     from run_text_generation import get_evaluation_configs, run_evaluation_loop
 
@@ -344,9 +368,11 @@ def run_online_eval(model):
     # We must write to a storage space that all ranks see.
     output_dir = os.path.join(args.save, "online_eval")
     os.makedirs(output_dir, exist_ok=True)
-    
+
     # Use the common evaluation loop
-    scores = run_evaluation_loop(model[0].module, configs, output_dir_override=output_dir, print_output=False)
+    scores = run_evaluation_loop(
+        model[0].module, configs, output_dir_override=output_dir, print_output=False
+    )
 
     return [scores]
 
@@ -364,6 +390,7 @@ def write_eval_to_tensorboard(data, iteration, writer, walltime=None):
 def write_online_eval_to_tensorboard(data, iteration, writer, walltime=None):
     """Write online evaluation data to Tensorboard."""
     import shutil
+
     args = get_args()
 
     # Define source and destination directories
diff --git a/examples/rl/environments/countdown/countdown.py b/examples/rl/environments/countdown/countdown.py
index c5ad57bb72d..acfabc46681 100644
--- a/examples/rl/environments/countdown/countdown.py
+++ b/examples/rl/environments/countdown/countdown.py
@@ -1,3 +1,4 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # This file is adapted from code in https://github.com/Jiayi-Pan/TinyZero
 
 import re
diff --git a/examples/rl/environments/math/aime_agent.py b/examples/rl/environments/math/aime_agent.py
index 25d13b73f02..31f7ce8786f 100644
--- a/examples/rl/environments/math/aime_agent.py
+++ b/examples/rl/environments/math/aime_agent.py
@@ -44,4 +44,6 @@ async def get_prompt(self, validation=False) -> tuple[str, dict]:
         return prompt, golden
 
     async def get_reward(self, response, golden: dict, finish_reason: str) -> float:
-        return self.compute_score(response, golden, golden_key="Answer", finish_reason=finish_reason)
+        return self.compute_score(
+            response, golden, golden_key="Answer", finish_reason=finish_reason
+        )
diff --git a/examples/rl/environments/math/bigmath_agent.py b/examples/rl/environments/math/bigmath_agent.py
index 5895e07e996..c19572bddfb 100644
--- a/examples/rl/environments/math/bigmath_agent.py
+++ b/examples/rl/environments/math/bigmath_agent.py
@@ -37,4 +37,6 @@ async def get_prompt(self, validation=False) -> tuple[str, dict]:
         return prompt, golden
 
     async def get_reward(self, response, golden: dict, finish_reason: str) -> float:
-        return self.compute_score(response, golden, golden_key="answer", finish_reason=finish_reason)
+        return self.compute_score(
+            response, golden, golden_key="answer", finish_reason=finish_reason
+        )
diff --git a/examples/rl/environments/math/dapo_agent.py b/examples/rl/environments/math/dapo_agent.py
index 7339b384bc1..50db0e085d3 100644
--- a/examples/rl/environments/math/dapo_agent.py
+++ b/examples/rl/environments/math/dapo_agent.py
@@ -50,4 +50,6 @@ async def get_prompt(self, validation=False) -> tuple[str, dict]:
         return prompt, golden
 
     async def get_reward(self, response, golden: dict, finish_reason: str) -> float:
-        return self.compute_score(response, golden, golden_key="answer", finish_reason=finish_reason)
+        return self.compute_score(
+            response, golden, golden_key="answer", finish_reason=finish_reason
+        )
diff --git a/examples/rl/environments/math/gsm8k_agent.py b/examples/rl/environments/math/gsm8k_agent.py
index 278f88bd57c..5b5be42557e 100644
--- a/examples/rl/environments/math/gsm8k_agent.py
+++ b/examples/rl/environments/math/gsm8k_agent.py
@@ -23,18 +23,20 @@
 
 
 class GSM8KAgent(MathAgent):
-    def __init__(self,
+    def __init__(
+        self,
         answer_format: str = "boxed",
         format_reward: float = 0.0,
         negative_reward: float = 0.0,
         partial_end_reward: float = 0.0,
-        **kwargs):
+        **kwargs,
+    ):
         super().__init__(
             answer_format=answer_format,
             format_reward=format_reward,
             negative_reward=negative_reward,
             partial_end_reward=partial_end_reward,
-            **kwargs
+            **kwargs,
         )
         self.env_id: str = "gsm8k"
 
@@ -65,7 +67,9 @@ async def get_prompt(self, validation=False) -> tuple[str, dict]:
         return prompt, golden
 
     async def get_reward(self, response, golden: dict, finish_reason: str) -> float:
-        return self.compute_score(response, golden, golden_key="numeric_answer", finish_reason=finish_reason)
+        return self.compute_score(
+            response, golden, golden_key="numeric_answer", finish_reason=finish_reason
+        )
 
 
 # pytest
diff --git a/examples/rl/environments/math/math_agent.py b/examples/rl/environments/math/math_agent.py
index f5610c0d742..5f21a9aa41e 100644
--- a/examples/rl/environments/math/math_agent.py
+++ b/examples/rl/environments/math/math_agent.py
@@ -20,13 +20,16 @@
     MATHVERIFY_AVAILABLE
 ), "math_verify is not installed but now required. Install it using `pip install math-verify` to continue."
 
+
 class MathAgent(RewardOnlyAgent):
-    def __init__(self,
+    def __init__(
+        self,
         format_reward: float = 0.0,
         answer_format: str = "tagged",
         negative_reward: float = 0.0,
         partial_end_reward: float = 0.0,
-        **kwargs):
+        **kwargs,
+    ):
         """
         Args:
             format_reward (float): Reward given when the answer is in the expected format,
@@ -48,11 +51,7 @@ def __init__(self,
         self.partial_end_reward = partial_end_reward
 
     def compute_score(
-        self,
-        response: str,
-        golden: dict,
-        finish_reason: str,
-        golden_key: str = "answer",
+        self, response: str, golden: dict, finish_reason: str, golden_key: str = "answer"
     ) -> float:
         """Take a response and a golden answer and return a score. Supports tagged or boxed answers.
 
@@ -68,7 +67,7 @@ def compute_score(
             # Only consider the last occurrence
             last_match = answer_tag_match[-1]
             final_answer = last_match.group(1).strip()
-            after = response[last_match.end():].lstrip()
+            after = response[last_match.end() :].lstrip()
 
             try:
                 parsed_answer = parse(final_answer)
@@ -94,7 +93,7 @@ def compute_score(
             if boxed_match:
                 last_match = boxed_match[-1]
                 final_answer = last_match.group(1).strip()
-                after = response[last_match.end():].lstrip()
+                after = response[last_match.end() :].lstrip()
                 try:
                     parsed_answer = parse(final_answer)
                 except ValueError as e:
diff --git a/examples/rl/environments/math/openmath_agent.py b/examples/rl/environments/math/openmath_agent.py
index 98f9ae22d0c..b4a03586b6d 100644
--- a/examples/rl/environments/math/openmath_agent.py
+++ b/examples/rl/environments/math/openmath_agent.py
@@ -37,4 +37,6 @@ async def get_prompt(self, validation=False) -> tuple[str, dict]:
         return prompt, golden
 
     async def get_reward(self, response, golden: dict, finish_reason: str) -> float:
-        return self.compute_score(response, golden, golden_key="expected_answer", finish_reason=finish_reason)
+        return self.compute_score(
+            response, golden, golden_key="expected_answer", finish_reason=finish_reason
+        )
diff --git a/examples/run_simple_mcore_train_loop.py b/examples/run_simple_mcore_train_loop.py
index 1ba5e10cfc9..11e1494dfbf 100644
--- a/examples/run_simple_mcore_train_loop.py
+++ b/examples/run_simple_mcore_train_loop.py
@@ -1,31 +1,30 @@
 # Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 
 import os
+from functools import partial
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterator, Tuple
+
 import torch
 from torch.optim import Adam
 from torch.utils.data import DataLoader
-from functools import partial
-from pathlib import Path
-from typing import Any, Callable, Dict, Tuple, Iterator
-from megatron.core import parallel_state
-from megatron.core import dist_checkpointing
-from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
-from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
-from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.core.models.gpt.gpt_model import GPTModel
-from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
-from megatron.core.datasets.utils import compile_helpers
-from megatron.core.datasets.blended_megatron_dataset_builder import (
-    BlendedMegatronDatasetBuilder,
-)
+
+from megatron.core import dist_checkpointing, parallel_state
+from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.core.datasets.gpt_dataset import GPTDatasetConfig, MockGPTDataset
-from megatron.core.distributed import DistributedDataParallel
-from megatron.core.distributed import DistributedDataParallelConfig
+from megatron.core.datasets.utils import compile_helpers
+from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
 from megatron.core.distributed.finalize_model_grads import finalize_model_grads
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.core.pipeline_parallel.schedules import get_forward_backward_func
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.tokenizers import MegatronTokenizer
+from megatron.core.transformer.transformer_config import TransformerConfig
 
 _SEQUENCE_LENGTH: int = 64
 
+
 def initialize_distributed(
     tensor_model_parallel_size: int = 1, pipeline_model_parallel_size: int = 1
 ) -> None:
@@ -44,9 +43,7 @@ def initialize_distributed(
     local_rank: int = int(os.environ["LOCAL_RANK"])
 
     torch.cuda.set_device(local_rank)
-    torch.distributed.init_process_group(
-        backend="nccl", rank=rank, world_size=world_size
-    )
+    torch.distributed.init_process_group(backend="nccl", rank=rank, world_size=world_size)
 
     # Megatron core distributed training initialization
     parallel_state.initialize_model_parallel(
@@ -104,8 +101,7 @@ def get_train_data_iterator() -> Iterator:
         reset_attention_mask=False,
         eod_mask_loss=False,
         tokenizer=MegatronTokenizer.from_pretrained(
-            metadata_path={"library": "null-text"},
-            vocab_size=_SEQUENCE_LENGTH,
+            metadata_path={"library": "null-text"}, vocab_size=_SEQUENCE_LENGTH
         ),
         mid_level_dataset_surplus=0.005,
     )
@@ -159,16 +155,12 @@ def loss_func(
     labels: torch.Tensor = data["labels"].to(device)
     loss_mask: torch.Tensor = data["loss_mask"].to(device)
 
-    output_tensor: torch.Tensor = model(
-        tokens, position_ids, attention_mask, labels=labels
-    )
+    output_tensor: torch.Tensor = model(tokens, position_ids, attention_mask, labels=labels)
 
     return output_tensor, partial(loss_func, loss_mask)
 
 
-def save_distributed_checkpoint(
-    checkpoint_path: str, gpt_model: torch.nn.Module
-) -> None:
+def save_distributed_checkpoint(checkpoint_path: str, gpt_model: torch.nn.Module) -> None:
     """
     Save a distributed checkpoint of the GPT model using Megatron-Core utilities.
 
@@ -181,13 +173,9 @@ def save_distributed_checkpoint(
         gpt_model (torch.nn.Module): The GPT model to checkpoint (may be wrapped with DDP).
     """
     # Access underlying model if wrapped with DDP
-    model: torch.nn.Module = (
-        gpt_model.module if hasattr(gpt_model, "module") else gpt_model
-    )
+    model: torch.nn.Module = gpt_model.module if hasattr(gpt_model, "module") else gpt_model
     sharded_state_dict: Dict = model.sharded_state_dict(prefix="")
-    dist_checkpointing.save(
-        sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path
-    )
+    dist_checkpointing.save(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
 
 
 def load_distributed_checkpoint(
@@ -207,9 +195,7 @@ def load_distributed_checkpoint(
         torch.nn.Module: The model with loaded checkpoint weights.
     """
     # Access underlying model if wrapped with DDP
-    model: torch.nn.Module = (
-        gpt_model.module if hasattr(gpt_model, "module") else gpt_model
-    )
+    model: torch.nn.Module = gpt_model.module if hasattr(gpt_model, "module") else gpt_model
     sharded_state_dict: Dict = model.sharded_state_dict(prefix="")
     checkpoint: Dict = dist_checkpointing.load(
         sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path
@@ -230,15 +216,9 @@ def load_distributed_checkpoint(
     # This provides the finish_grad_sync() method required by finalize_model_grads().
     config: TransformerConfig = gpt_model.config
     ddp_config: DistributedDataParallelConfig = DistributedDataParallelConfig(
-        grad_reduce_in_fp32=False,
-        overlap_grad_reduce=False,
-        use_distributed_optimizer=False,
-    )
-    gpt_model = DistributedDataParallel(
-        config=config,
-        ddp_config=ddp_config,
-        module=gpt_model,
+        grad_reduce_in_fp32=False, overlap_grad_reduce=False, use_distributed_optimizer=False
     )
+    gpt_model = DistributedDataParallel(config=config, ddp_config=ddp_config, module=gpt_model)
 
     optim: Adam = Adam(gpt_model.parameters())
 
@@ -276,8 +256,6 @@ def load_distributed_checkpoint(
     save_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
 
     # Loading the model
-    gpt_model = load_distributed_checkpoint(
-        gpt_model=gpt_model, checkpoint_path=ckpt_path
-    )
+    gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
     gpt_model.to(device)
     print("Successfully loaded the model")
diff --git a/gpt_builders.py b/gpt_builders.py
index 24b5f89d311..4f3f983bc5c 100644
--- a/gpt_builders.py
+++ b/gpt_builders.py
@@ -11,6 +11,7 @@
 )
 from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
     get_transformer_block_with_experimental_attention_variant_spec,
+    get_transformer_layer_with_experimental_attention_variant_spec,
 )
 from megatron.core.models.gpt.heterogeneous.heterogeneous_layer_specs import (
     get_gpt_heterogeneous_layer_spec,
@@ -70,23 +71,29 @@ def gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_
         mtp_block_spec = None
         if args.mtp_num_layers is not None:
             assert not (config.transformer_impl == "inference_optimized")
-            if (
-                hasattr(transformer_layer_spec, 'layer_specs')
-                and len(transformer_layer_spec.layer_specs) == 0
-            ):
-                # Get the decoder layer spec explicitly if no decoder layer in the last stage,
-                # Only happens with block spec (TransformerBlockSubmodules) when using MoE.
-                transformer_layer_spec_for_mtp = _get_transformer_layer_spec(use_te, config)
+            # Get GPT decoder layer specs for the model.
+            if args.spec is not None:
+                mtp_transformer_layer_spec = import_module(args.spec)
             else:
                 # Define the decoder block spec
-                decoder_layer_specs = get_gpt_decoder_layer_specs(
-                    config, use_transformer_engine=use_te, normalization=args.normalization, qk_l2_norm=args.qk_l2_norm, vp_stage=vp_stage
-                )
-                transformer_layer_spec_for_mtp = decoder_layer_specs[-1]
+                if args.experimental_attention_variant is not None:
+                    decoder_layer_specs = (
+                        get_transformer_layer_with_experimental_attention_variant_spec(
+                            config=config
+                        )
+                    )
+                else:
+                    decoder_layer_specs = get_gpt_decoder_layer_specs(
+                        config,
+                        use_transformer_engine=use_te,
+                        normalization=args.normalization,
+                        qk_l2_norm=args.qk_l2_norm,
+                    )
+                mtp_transformer_layer_spec = decoder_layer_specs[-1]
             # Use spec of the last layer in decoder block as spec of the transformer layer in MTP
             mtp_block_spec = get_gpt_mtp_block_spec(
                 config,
-                transformer_layer_spec_for_mtp,
+                mtp_transformer_layer_spec,
                 use_transformer_engine=use_te,
                 vp_stage=vp_stage,
             )
@@ -135,7 +142,10 @@ def _get_transformer_layer_spec(use_te, config):
             use_te_activation_func=config.use_te_activation_func,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
+            fallback_to_eager_attn=config.fallback_to_eager_attn,
+            enable_hyper_connection=config.enable_hyper_connections,
             mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
+            dense_grouped_gemm=config.dense_grouped_gemm,
         )
     elif config.transformer_impl == "inference_optimized":
         return get_gpt_layer_with_inference_spec(
@@ -154,4 +164,5 @@ def _get_transformer_layer_spec(use_te, config):
             use_kitchen=config.use_kitchen,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
+            enable_hyper_connection=config.enable_hyper_connections,
         )
diff --git a/hello_world b/hello_world
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/megatron/core/datasets/blended_megatron_dataset_builder.py b/megatron/core/datasets/blended_megatron_dataset_builder.py
index f728fe10d03..6cb75ab5104 100644
--- a/megatron/core/datasets/blended_megatron_dataset_builder.py
+++ b/megatron/core/datasets/blended_megatron_dataset_builder.py
@@ -48,11 +48,13 @@ def __init__(
         sizes: List[int],
         is_built_on_rank: Callable,
         config: BlendedMegatronDatasetConfig,
+        vp_stage: Optional[int] = None,
     ):
         self.cls = cls
         self.sizes = sizes
         self.is_built_on_rank = is_built_on_rank
         self.config = config
+        self.vp_stage = vp_stage
 
         log_single_rank(
             logger,
diff --git a/megatron/core/datasets/data_schedule.py b/megatron/core/datasets/data_schedule.py
index 0f016473b6a..14861c0ef41 100644
--- a/megatron/core/datasets/data_schedule.py
+++ b/megatron/core/datasets/data_schedule.py
@@ -1,301 +1,661 @@
 # Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved.
 
-from typing import Any, List, Optional
+from typing import Dict, Optional, Type
 
 import torch
 
 from megatron.core import parallel_state
-from megatron.core.pipeline_parallel.hybrid_cp_schedule import BalancedCPScheduler
+from megatron.core.datasets.data_schedule_utils import (
+    align_sample_id_groups,
+    broadcast_scalars,
+    broadcast_tensor,
+    broadcast_to_pp_group,
+    build_packed_microbatches,
+    create_data_iterator,
+    dcp_get_total_workload,
+    dcp_gpus_needed,
+    dcp_make_buckets_equal,
+    get_batch_and_global_seqlens,
+    get_cp_slice_for_thd,
+    next_hdp_group,
+    reroute_samples_to_dcp_ranks,
+)
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.process_groups_config import ProcessGroupCollection
 
 
-class HybridCPDataLoaderWrapper:
-    """
-    A wrapper class that wraps around an existing data_iterator.
-    For every __next__ call,
-    1. Each DP rank pulls a batch of packed samples.
-    2. Extracts the sequence lengths of each sub-sample and all-gathers across the DP group.
-    3. Schedules the sub-samples to the DPxCP ranks using the BalancedCPScheduler.
-    4. Based on the schedule, reroutes the sub-samples to the correct rank using all-to-all.
-    5. Returns the assigned sub-samples to this rank.
-
-    Args:
-        data_iterator: The original data_iterator to wrap around
-        config: The config object containing the max_seqlen_per_dp_cp_rank
-        dp_cp_group: Data parallel context parallel group.
-    """
+class BasePackingScheduler:
+    """Base class for sequence packing schedulers."""
 
     def __init__(
-        self, data_iterator, config, pg_collection: Optional[ProcessGroupCollection] = None
+        self,
+        max_seqlen_per_dp_cp_rank: int,
+        cp_size: int,
+        dp_size: int,
+        microbatch_group_size_per_vp_stage: Optional[int],
     ):
-        self.data_iterator = data_iterator
-        self.config = config
-        if pg_collection is None:
-            self.dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
-            self.dp_group = parallel_state.get_data_parallel_group()
-            self.tp_group = parallel_state.get_tensor_model_parallel_group()
-        else:
-            self.dp_cp_group = pg_collection.dp_cp
-            self.dp_group = pg_collection.dp
-            self.tp_group = pg_collection.tp
-        assert (
-            self.dp_cp_group is not None and self.dp_group is not None and self.tp_group is not None
-        ), "dp_cp_group, dp_group, tp_group must not be None when using hybrid context parallel"
-
-        self.cp_balancing_scheduler = BalancedCPScheduler(
-            max_seq_len_per_rank=self.config.max_seqlen_per_dp_cp_rank, dp_cp_group=self.dp_cp_group
-        )
-
-        self.total_hdp_gpus = self.dp_cp_group.size()
+        """
+        Args:
+            max_seqlen_per_dp_cp_rank: The maximum sequence length per DPxCP rank.
+            cp_size: The context parallel size.
+            dp_size: The data parallel size.
+            microbatch_group_size_per_vp_stage: The microbatch group size per virtual
+            pipeline stage, only used when enabling VPP, otherwise None.
+        """
+        self.max_seqlen_per_dp_cp_rank = max_seqlen_per_dp_cp_rank
+        self.cp_size = cp_size
+        self.dp_size = dp_size
+        self.microbatch_group_size_per_vp_stage = microbatch_group_size_per_vp_stage
+
+    def get_required_sample_keys(self):
+        """Return the required key of each batch."""
+        raise NotImplementedError
+
+    def get_groups_and_subsamples(self, sample_id_seqlens):
+        """schedule the samples into groups"""
+        raise NotImplementedError
+
+    def run(
+        self,
+        data_iterator,
+        num_microbatches,
+        dp_group,
+        tp_group,
+        pp_group,
+        dp_cp_group,
+        dev,
+        config,
+    ):
+        """
+        Run the scheduler and return the new data_iterator.
+
+        Args:
+            data_iterator: The data iterator.
+            num_microbatches: The number of microbatches to fetch.
+            dp_group: Data parallel process group.
+            tp_group: Tensor parallel process group.
+            pp_group: Pipeline parallel process group.
+            dp_cp_group: Data parallel + context parallel process group.
+            dev: CUDA device.
+            config: Model parallel config.
 
-    def __iter__(self):
-        """Return self as an iterator."""
-        return self
+        Returns:
+            new_data_iterator: The new data iterator (or list for VPP).
+            num_micro_batches: Number of micro batches after scheduling.
+            seqlen_sum_this_global_batch: Total tokens for FLOPs calculation.
+            seqlen_squared_sum_this_global_batch: Sum of squared seqlens for FLOPs.
+        """
+        raise NotImplementedError
+
+
+class DpBalancedScheduler(BasePackingScheduler):
+    """Packs sequences in their original order until reaching the max limit of sequence length."""
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.max_seq_len_all_ranks = self.max_seqlen_per_dp_cp_rank * self.cp_size
+        self.is_dynamic_cp = False
+
+    def get_required_sample_keys(self):
+        """Return the required key of each batch."""
+        return [
+            "tokens",
+            "labels",
+            "loss_mask",
+            "position_ids",
+            "original_seq_len",  # Length of the original sequence length, should be a gpu tensor.
+            "padded_seq_len",  # Length of the padded sequence length, should be a gpu tensor.
+        ]
 
-    def get_global_seqlens(self, subsample_seqlens: torch.Tensor) -> List[int]:
+    def get_groups_and_subsamples(self, sample_id_seqlens):
+        """
+        Packs sequences in their original order until reaching the max limit of sequence length.
+        """
+        sample_id_groups = []
+        packed_id_groups = []
+        sum_seqlen = 0
+        single_microbatch = []
+
+        for i in range(len(sample_id_seqlens)):
+            if sum_seqlen + sample_id_seqlens[i][1] <= self.max_seq_len_all_ranks:
+                single_microbatch.append(i)
+                sum_seqlen += sample_id_seqlens[i][1]
+            else:
+                packed_id_groups.append(single_microbatch)
+                single_microbatch = [i]
+                sum_seqlen = sample_id_seqlens[i][1]
+        if len(single_microbatch) > 0:
+            packed_id_groups.append(single_microbatch)
+
+        # we want the number of packed sequences to be multiple of dp_size
+        # so we move few samples from previous microbatch
+        # to the end of the microbatches if needed
+        num_packed_sequence = len(packed_id_groups)
+
+        # when enabling vpp, we want the number of packed sequences to be
+        # multiple of dp_size * microbatch_group_size_per_vp_stage
+        multiple = self.dp_size * (
+            self.microbatch_group_size_per_vp_stage
+            if self.microbatch_group_size_per_vp_stage is not None
+            else 1
+        )
+        if num_packed_sequence % multiple != 0:
+            remainder = num_packed_sequence % multiple
+            num_to_move = multiple - remainder
+            i = num_packed_sequence - 1
+            while num_to_move > 0:
+                assert i >= 0, "Not enough samples to move"
+                if len(packed_id_groups[i]) > 1:
+                    seq_id = packed_id_groups[i].pop()
+                    packed_id_groups.append([seq_id])
+                    num_to_move -= 1
+                else:
+                    i -= 1
+
+        num_micro_batches = int(len(packed_id_groups) / self.dp_size)
+        for i in range(num_micro_batches):
+            sample_id_groups.append([])
+            for j in range(self.cp_size * self.dp_size):
+                seq_id = int(i * self.dp_size + j / self.cp_size)
+                sample_id_groups[i].append(packed_id_groups[seq_id])
+        return sample_id_groups
+
+    def run(
+        self,
+        data_iterator,
+        num_microbatches: int,
+        dp_group,
+        tp_group,
+        pp_group,
+        dp_cp_group,
+        dev: torch.device,
+        config,
+    ):
         """
-        Gathers the sequence lengths of all subsamples from all DP ranks.
-        Each DP rank loads the same number of microbatches but each microbatch
-        may have a different number of subsamples.
+        Run the complete scheduling pipeline.
+
+        Steps:
+            1. Fetch batches and gather global sequence lengths
+            2. Check required sample keys
+            3. Schedule samples into groups
+            4. Reroute samples to DCP ranks
+            5. Build packed microbatches
+            6. Calculate FLOPs info
+            7. Broadcast to PP group (for middle PP stages)
+            8. Broadcast to TP group (for non-TP-0 ranks)
+            9. Handle VPP if enabled
+
+        Args:
+            data_iterator: The data iterator.
+            num_microbatches: The number of microbatches to fetch.
+            dp_group: Data parallel process group.
+            tp_group: Tensor parallel process group.
+            pp_group: Pipeline parallel process group.
+            dp_cp_group: Data parallel + context parallel process group.
+            dev: CUDA device.
+            config: Model parallel config.
 
-        We find the number of subsamples each rank holds and then gather the
-        sequence lengths of all subsamples from all ranks.
+        Returns:
+            new_data_iterator: The new data iterator (or list for VPP).
+            num_micro_batches: Number of micro batches after scheduling.
+            seqlen_sum_this_global_batch: Total tokens for FLOPs calculation.
+            seqlen_squared_sum_this_global_batch: Sum of squared seqlens for FLOPs.
         """
-        # Collect the number of subsamples from all ranks
-        local_len = torch.tensor([subsample_seqlens.shape[0]], dtype=torch.int32).cuda()
-        dp_subsample_count = [torch.zeros_like(local_len) for _ in range(self.dp_group.size())]
-        torch.distributed.all_gather(dp_subsample_count, local_len, group=self.dp_group)
 
-        # Find the max number of subsamples across all ranks and pad subsample_seqlens to max length
-        dp_subsample_counts = torch.stack(dp_subsample_count, dim=0).cpu().view(-1)
-        max_sub_samples = int(dp_subsample_counts.max().item())
+        total_dcp_gpus = dp_cp_group.size()
+
+        # Handle VPP: extract the correct data_iterator for this PP stage.
+        # When VPP is enabled, data_iterator is a list with one entry per VPP stage.
+        # We only need one data_iterator to run the schedule (all VPP stages on the
+        # same PP rank share the same underlying dataset), so pick the first non-None.
+        # Record which VPP stages had data so create_data_iterator knows which ones
+        # need full samples vs metadata only.
+        vpp_has_data = None
+        if (
+            config.virtual_pipeline_model_parallel_size is not None
+            and config.virtual_pipeline_model_parallel_size > 1
+        ):
+            assert len(data_iterator) == config.virtual_pipeline_model_parallel_size
+            vpp_has_data = [di is not None for di in data_iterator]
+            extracted = None
+            for di in data_iterator:
+                if di is not None:
+                    extracted = di
+                    break
+            data_iterator = extracted
+
+        # data_iterator is not None on TP rank 0 for PP stages that need data
+        # (first stage, last stage, or any stage with MTP).
+        if data_iterator is not None:
+            assert tp_group.rank() == 0, "Only TP rank 0 should have data_iterator"
+
+            # Step 1: Fetch batches and gather global sequence lengths
+            batch, global_id_seqlens, global_ids_this_rank, offsets, seqlens_gathered = (
+                get_batch_and_global_seqlens(data_iterator, num_microbatches, dp_group)
+            )
 
-        if local_len.item() < max_sub_samples:
-            subsample_seqlens_padded = torch.cat(
-                [
-                    subsample_seqlens,
-                    torch.zeros(max_sub_samples - local_len.item(), dtype=torch.int32).cuda(),
-                ],
-                dim=0,
+            # Step 2: Check required sample keys
+            for key in self.get_required_sample_keys():
+                assert (
+                    key in batch[0]
+                ), f"Batch missing required key {key}, provided keys: {batch[0].keys()}"
+
+            # Step 3: Schedule samples into groups
+            sample_id_groups = self.get_groups_and_subsamples(global_id_seqlens)
+
+            # Validate scheduling result
+            set_gbs = set()
+            for group in sample_id_groups:
+                for sub in group:
+                    set_gbs.update(sub)
+            assert len(set_gbs) == len(global_id_seqlens), (
+                f"set_gbs length: {len(set_gbs)} != "
+                f"global_id_seqlens length: {len(global_id_seqlens)}"
             )
-        else:
-            subsample_seqlens_padded = subsample_seqlens
 
-        # Gather the subsample_seqlens from all ranks
-        seqlens_gathered = [
-            torch.empty_like(subsample_seqlens_padded) for _ in range(self.dp_group.size())
-        ]
-        torch.distributed.all_gather(
-            seqlens_gathered, subsample_seqlens_padded, group=self.dp_group
-        )
+            # Step 4: Reroute samples to DCP ranks
+            samples_this_rank_with_id = reroute_samples_to_dcp_ranks(
+                batch,
+                global_ids_this_rank,
+                global_id_seqlens,
+                sample_id_groups,
+                offsets,
+                dp_group,
+                tp_group,
+                dp_cp_group,
+                total_dcp_gpus,
+            )
 
-        # Trim each seqlens_gathered to the length of the correct sample
-        for dp_rank, seqlen in enumerate(seqlens_gathered):
-            seqlens_gathered[dp_rank] = seqlen[: dp_subsample_counts[dp_rank]]
+            dcp_rank = dp_cp_group.rank()
+            num_micro_batches = len(sample_id_groups)
 
-        seqlens_gathered = torch.cat(seqlens_gathered, dim=0)
-        seqlens_gathered = seqlens_gathered.cpu().tolist()
+            # Step 5: Build packed microbatches
+            new_samples = build_packed_microbatches(
+                samples_this_rank_with_id, sample_id_groups, dcp_rank, dev, self.is_dynamic_cp
+            )
 
-        # Calculate the offsets to assign unique global ID to each subsample.
-        csum = torch.cumsum(dp_subsample_counts, dim=0, dtype=torch.int32)
-        offsets = torch.cat([torch.zeros(1, dtype=torch.int32), csum[:-1]], dim=0)
+            # Step 6: Calculate FLOPs info
+            seqlen_sum_this_global_batch = float(sum(seqlens_gathered))
+            seqlen_squared_sum_this_global_batch = float(
+                sum(seqlen**2 for seqlen in seqlens_gathered)
+            )
+        else:
+            (
+                new_samples,
+                num_micro_batches,
+                seqlen_sum_this_global_batch,
+                seqlen_squared_sum_this_global_batch,
+            ) = (None, None, None, None)
+
+        # Step 7: Broadcast to PP group (for middle PP stages)
+        if tp_group.rank() == 0:
+            (
+                new_samples,
+                num_micro_batches,
+                seqlen_sum_this_global_batch,
+                seqlen_squared_sum_this_global_batch,
+            ) = broadcast_to_pp_group(
+                new_samples,
+                num_micro_batches,
+                seqlen_sum_this_global_batch,
+                seqlen_squared_sum_this_global_batch,
+                pp_group,
+                dev,
+                is_dynamic_cp=self.is_dynamic_cp,
+            )
 
-        return seqlens_gathered, offsets
+        # Step 8: Broadcast to TP group (for non-TP-0 ranks)
+        (num_micro_batches, seqlen_sum_this_global_batch, seqlen_squared_sum_this_global_batch) = (
+            broadcast_scalars(
+                [
+                    num_micro_batches,
+                    seqlen_sum_this_global_batch,
+                    seqlen_squared_sum_this_global_batch,
+                ],
+                tp_group,
+                dev,
+            )
+        )
+        num_micro_batches = int(num_micro_batches)
 
-    def get_global_id_seqlens(self, num_local_subsamples, offsets, seqlens_gathered):
-        """
-        Calculates the global ID for each subsample.
+        # Step 9: create data_iterator and handle VPP if enabled
+        new_data_iterator = create_data_iterator(
+            new_samples, tp_group, config, vpp_has_data, self.is_dynamic_cp
+        )
 
-        We assign a unique global ID to each subsample.
+        return (
+            new_data_iterator,
+            num_micro_batches,
+            seqlen_sum_this_global_batch,
+            seqlen_squared_sum_this_global_batch,
+        )
 
-        Returns:
-        global_id_seqlens: list of (global_id, seqlen) tuples for scheduling.
-        global_ids_this_rank: list of global IDs locally present on this rank.
-        """
-        dp_rank = self.dp_group.rank()
-        global_ids = torch.arange(len(seqlens_gathered), dtype=torch.int32).cuda()
-        # Create a list of (global_id, seqlen) tuples for scheduling
-        global_id_seqlens = [(i, seqlens_gathered[i]) for i in range(len(global_ids))]
-        # Get the global IDs locally present on this rank
-        global_ids_this_rank = global_ids[
-            offsets[dp_rank] : offsets[dp_rank] + num_local_subsamples
-        ]
 
-        return global_id_seqlens, global_ids_this_rank
+class DefaultDynamicCPScheduler(DpBalancedScheduler):
+    """
+    Dynamic CP scheduler that balances workload across variable CP sizes.
+    """
 
-    def _gid_to_src_rank(self, gid: int, offsets: List[int]) -> int:
-        dp_src_rank = torch.bucketize(gid, offsets[1:] - 1)
-        # Since the torch.distributed.get_process_group_ranks
-        # provides the global rank, we need to consider TP
-        hdp_rank = (
-            torch.distributed.get_process_group_ranks(self.dp_group)[dp_src_rank]
-            // self.tp_group.size()
-        )
-        return hdp_rank
+    def __init__(self, *args, min_cp_size=1, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.is_dynamic_cp = True
+        self.max_seq_len_per_rank = self.max_seqlen_per_dp_cp_rank
+        self.total_hdp_gpus = self.dp_size * self.cp_size
+        self.min_cp_size = min_cp_size
 
-    def reroute_samples_to_hdp_ranks(
-        self, batch, global_ids_this_rank, global_id_seqlens, sample_id_groups, offsets
-    ):
+    def get_groups_and_subsamples(self, sample_id_seqlens):
         """
-        Reroutes the sub-samples to the correct rank after scheduling.
-
-        For each key in the batch dict, we perform an all-to-all communication
-        to transfer the data to the correct ranks.
-        Since all CP ranks within a DP group have the same data, we only need
-        to transfer data between matching CP ranks.
+        This function recursively forms groups of sub-samples such that all DPxCP ranks
+        have a roughly balanced workload in the group.
         """
-        gid2local_id = {int(gid): i for i, gid in enumerate(global_ids_this_rank)}
-        hdp_rank = self.dp_cp_group.rank()
-        dp_ranks = torch.distributed.get_process_group_ranks(self.dp_group)
-        # Here we actually want to get the DP group's rank within the HDP group,
-        # we need to consider TP
-        dp_ranks = [r // self.tp_group.size() for r in dp_ranks]
-
-        data_keys = batch[0].keys()
-
-        # Create the send plan
-        combined_sample_id_groups: List[List[int]] = [[] for _ in range(self.total_hdp_gpus)]
-
-        for d in range(self.total_hdp_gpus):
-            for sample_id_group in sample_id_groups:
-                combined_sample_id_groups[d].extend(sample_id_group[d])
-
-        for dest_rank in range(self.total_hdp_gpus):
-            combined_sample_id_groups[dest_rank].sort()
-
-        # Filter out samples that are not present on this rank
-        send_ids_sorted = [
-            gid
-            for d in dp_ranks
-            for gid in combined_sample_id_groups[d]
-            if gid in global_ids_this_rank
-        ]
-        # send_counts = [len(combined_sample_id_groups[d]) for d in range(self.total_hdp_gpus)]
-
-        send_lens_split = [0] * self.total_hdp_gpus
-        for dest_rank in range(self.total_hdp_gpus):
-            if dest_rank in dp_ranks:
-                send_lens_split[dest_rank] = sum(
-                    [
-                        global_id_seqlens[gid][1]
-                        for gid in combined_sample_id_groups[dest_rank]
-                        if gid in global_ids_this_rank
-                    ]
-                )
-            else:
-                # We only need to share local data with DP ranks that have different data.
-                send_lens_split[dest_rank] = 0
-
-        # Create the recv plan
-        recv_sample_id_groups = [[] for _ in range(self.total_hdp_gpus)]
-        for gid in combined_sample_id_groups[hdp_rank]:
-            src_rank = self._gid_to_src_rank(gid, offsets)
-            recv_sample_id_groups[src_rank].append(gid)
-
-        recv_lens_split = [0] * self.total_hdp_gpus
-        for src_rank in range(self.total_hdp_gpus):
-            recv_lens_split[src_rank] = sum(
-                [global_id_seqlens[gid][1] for gid in recv_sample_id_groups[src_rank]]
-            )
-
-        recv_ids_sorted = [
-            gid for d in range(self.total_hdp_gpus) for gid in recv_sample_id_groups[d]
-        ]
-        recv_counts = [len(recv_sample_id_groups[d]) for d in range(self.total_hdp_gpus)]
-
-        recv_samples = [{k: None for k in data_keys} for _ in range(sum(recv_counts))]
-
-        def _pack_sample_by_key(key: str) -> torch.Tensor:
-            flattened_tensors = []
-            for gid in send_ids_sorted:
-                t = batch[gid2local_id[gid]][key].to(torch.cuda.current_device(), non_blocking=True)
-                flattened_tensors.append(t)
-            return (
-                torch.cat(flattened_tensors, dim=0)
-                if flattened_tensors
-                else torch.empty(0, device=torch.cuda.current_device(), dtype=batch[0][key].dtype)
-            )
+        mslpr = self.max_seq_len_per_rank
+        min_cp = self.min_cp_size
+        workload_fn = lambda seq_len, cp_size=None: dcp_get_total_workload(
+            seq_len, mslpr, cp_size, min_cp
+        )
+        gpus_fn = lambda seq_len: dcp_gpus_needed(seq_len, mslpr, min_cp)
+        buckets_fn = lambda sample_seqlens, compute_est: dcp_make_buckets_equal(
+            sample_seqlens, compute_est, mslpr, min_cp
+        )
 
-        def _unpack_sample_by_key(key: str, recv_tensor: torch.Tensor):
-            cursor = 0
-            for i, gid in enumerate(recv_ids_sorted):
-                sample_len = global_id_seqlens[gid][1]
-                recv_samples[i][key] = recv_tensor[cursor : cursor + sample_len]
-                cursor += sample_len
-
-        for key in data_keys:
-            send_tensor = _pack_sample_by_key(key)
-            recv_tensor = torch.empty(
-                sum(recv_lens_split), device=torch.cuda.current_device(), dtype=send_tensor.dtype
+        groups = []
+        sample_id_groups = []
+        sample_id_seqlens = sorted(sample_id_seqlens, key=lambda x: x[1], reverse=True)
+        while sample_id_seqlens:
+            mb, sample_id_seqlens, exec_times, sample_ids = next_hdp_group(
+                sample_id_seqlens,
+                workload_fn,
+                self.total_hdp_gpus,
+                gpus_needed_fn=gpus_fn,
+                make_buckets_equal_fn=buckets_fn,
+                max_seq_len_per_rank=mslpr,
+                get_total_workload_fn=workload_fn,
             )
-            torch.distributed.all_to_all_single(
-                output=recv_tensor,
-                input=send_tensor,
-                output_split_sizes=recv_lens_split,
-                input_split_sizes=send_lens_split,
-                group=self.dp_cp_group,
+            groups.append(mb)
+            sample_id_groups.append(sample_ids)
+
+        if (
+            self.microbatch_group_size_per_vp_stage is not None
+            and self.microbatch_group_size_per_vp_stage > 1
+        ):
+            sample_id_groups = align_sample_id_groups(
+                sample_id_groups, self.microbatch_group_size_per_vp_stage
             )
-            _unpack_sample_by_key(key, recv_tensor)
 
-        recv_sample_with_id = {
-            recv_id: recv_samples[i] for i, recv_id in enumerate(recv_ids_sorted)
-        }
-        return recv_sample_with_id
+        return sample_id_groups
 
-    def unpack_batch(self, batch):
-        """
-        Unpacks the packed samples into a list of sub-samples.
-        Since each sub-sample may be routed to different DPxCP ranks,
-        we unpack the sample here to avoid unnecessarily transferring
-        the entire packed sample.
-        """
-        batch_unpacked = []
-        for sample in batch:
-            for sub_sample in range(sample["cu_seqlens"].shape[0] - 1):
-                sub_sample_dict = {}
-                start_idx = sample["cu_seqlens"][sub_sample]
-                end_idx = sample["cu_seqlens"][sub_sample + 1]
-                if end_idx - start_idx == 0:
-                    continue
-                for key in sample.keys():
-                    if key in ["cu_seqlens", "batch_idx", "max_seqlen"]:
-                        continue
-                    sub_sample_dict[key] = sample[key][start_idx:end_idx]
-                batch_unpacked.append(sub_sample_dict)
-        return batch_unpacked
-
-    def __next__(self) -> Any:
-        """
-        Get the next item from the dataset, pull scheduling metadata and return it.
-        """
-        if self.data_iterator is None:
-            # TP0 reads from data_iterator, others receive via broadcast.
-            return None, None
-        else:
-            batch = next(self.data_iterator)
-        subsample_seqlens = []
-        for sample in batch:
-            subsample_seqlens.extend(
-                [
-                    int(sample["cu_seqlens"][i + 1] - sample["cu_seqlens"][i])
-                    for i in range(0, sample["cu_seqlens"].shape[0] - 1)
-                ]
-            )
-        subsample_seqlens = torch.tensor(subsample_seqlens, dtype=torch.int32).cuda()
-        subsample_seqlens = subsample_seqlens[subsample_seqlens != 0]
 
-        seqlens_gathered, offsets = self.get_global_seqlens(subsample_seqlens)
+scheduler_map: Dict[str, Type[BasePackingScheduler]] = {
+    "dp_balanced": DpBalancedScheduler,
+    "default_dynamic_cp": DefaultDynamicCPScheduler,
+}
 
-        global_id_seqlens, global_ids_this_rank = self.get_global_id_seqlens(
-            subsample_seqlens.shape[0], offsets, seqlens_gathered
-        )
 
-        groups, sample_id_groups = self.cp_balancing_scheduler.get_groups_and_subsamples(
-            global_id_seqlens, self.config
-        )
+def wrap_data_iterator(
+    data_iterator, config, num_microbatches, pg_collection: Optional[ProcessGroupCollection] = None
+):
+    """
+    A wrapper function that wraps around an existing data_iterator
+    and return the num_micro_batches for sequence packing.
 
-        batch = self.unpack_batch(batch)
-        samples_this_rank_with_id = self.reroute_samples_to_hdp_ranks(
-            batch, global_ids_this_rank, global_id_seqlens, sample_id_groups, offsets
+    Args:
+        data_iterator: The original data_iterator to wrap around
+        config: The config object containing the max_seqlen_per_dp_cp_rank
+        dp_cp_group: Data parallel context parallel group.
+        pg_collection: The process group collection.
+    """
+
+    if pg_collection is None:
+        dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+        dp_group = parallel_state.get_data_parallel_group()
+        tp_group = parallel_state.get_tensor_model_parallel_group()
+        pp_group = parallel_state.get_pipeline_model_parallel_group()
+    else:
+        dp_cp_group = pg_collection.dp_cp
+        dp_group = pg_collection.dp
+        tp_group = pg_collection.tp
+        pp_group = pg_collection.pp
+    assert (
+        dp_cp_group is not None
+        and dp_group is not None
+        and tp_group is not None
+        and pp_group is not None
+    ), "dp_cp_group, dp_group, tp_group must not be None when using sequence packing"
+
+    dev = torch.cuda.current_device()
+    dp_size = dp_group.size()
+    cp_size = dp_cp_group.size() // dp_size
+
+    # Look up the scheduler class by name
+    scheduler_type = config.sequence_packing_scheduler
+
+    scheduler_kwargs = {}
+    if scheduler_type == 'default_dynamic_cp':
+        scheduler_kwargs['min_cp_size'] = config.min_dynamic_context_parallel_size
+
+    scheduler = scheduler_map[scheduler_type](
+        config.max_seqlen_per_dp_cp_rank,
+        cp_size,
+        dp_size,
+        (
+            None
+            if config.virtual_pipeline_model_parallel_size is None
+            else config.microbatch_group_size_per_vp_stage
+        ),
+        **scheduler_kwargs,
+    )
+
+    (
+        new_data_iterator,
+        num_micro_batches,
+        seqlen_sum_this_global_batch,
+        seqlen_squared_sum_this_global_batch,
+    ) = scheduler.run(
+        data_iterator, num_microbatches, dp_group, tp_group, pp_group, dp_cp_group, dev, config
+    )
+
+    return (
+        new_data_iterator,
+        num_micro_batches,
+        seqlen_sum_this_global_batch,
+        seqlen_squared_sum_this_global_batch,
+    )
+
+
+def get_batch_on_this_rank_for_sequence_packing(
+    data_iterator,
+    vpp_size: Optional[int] = None,
+    mtp_on_this_rank: bool = False,
+    vp_stage: Optional[int] = None,
+    dynamic_cp: bool = False,
+    pg_collection: Optional[ProcessGroupCollection] = None,
+):
+    """
+    Get a batch of data for sequence packing.
+    Args:
+        data_iterator (Iterator): The data iterator to get the batch from.
+        mtp_on_this_rank (bool): Whether to use multi-token prediction.
+        vp_stage (Optional[int]): The stage of the pipeline.
+    Returns:
+        tuple of (tokens, labels, loss_mask, attention_mask, position_ids, packed_seq_params)
+    """
+
+    if pg_collection is None:
+        tp_group = parallel_state.get_tensor_model_parallel_group()
+        pp_group = parallel_state.get_pipeline_model_parallel_group()
+        cp_group = parallel_state.get_context_parallel_group()
+    else:
+        tp_group = pg_collection.tp
+        pp_group = pg_collection.pp
+        cp_group = pg_collection.cp
+
+    tp_src_rank = torch.distributed.get_process_group_ranks(tp_group)[0]
+
+    is_tp_rank_0 = tp_group.rank() == 0
+    is_first_stage = pp_group.rank() == 0 and (vp_stage is None or vp_stage == 0)
+    is_last_stage = pp_group.rank() == pp_group.size() - 1 and (
+        vp_stage is None or vp_stage == vpp_size - 1
+    )
+
+    is_first_or_last_stage = is_first_stage or is_last_stage
+    dev = torch.cuda.current_device()
+
+    # data_iterator should return a batch including the following keys.
+    batch_keys = ['cu_seqlens', 'cu_seqlens_padded', 'max_seqlen']
+    if dynamic_cp:
+        batch_keys.append('local_cp_size')
+    if is_first_stage or mtp_on_this_rank:
+        batch_keys.append('tokens')
+        batch_keys.append('position_ids')
+    if is_last_stage or mtp_on_this_rank:
+        batch_keys.append('labels')
+        batch_keys.append('loss_mask')
+
+    # Get a batch from data_iterator or create an emtpy batch.
+    if is_tp_rank_0:
+        assert data_iterator is not None
+        batch = next(data_iterator)
+        for key in batch_keys:
+            assert key in batch, f"{key} is missing in current batch."
+    else:
+        assert data_iterator is None, "Non TP 0 rank should not have data_iterator"
+        batch = {}
+
+    # For dynamic CP, determine the correct cp_group from batch on TP rank 0.
+    if dynamic_cp and is_tp_rank_0:
+        local_cp_size_val = batch['local_cp_size']
+        if isinstance(local_cp_size_val, torch.Tensor):
+            local_cp_size_val = local_cp_size_val.item()
+        cp_group = parallel_state.get_dynamic_data_context_parallel_groups(
+            group_size=local_cp_size_val
         )
-        return samples_this_rank_with_id, sample_id_groups
+
+    # Partition tokens, position_ids, labels, loss_mask for context parallel.
+    # Only TP rank 0 on stages that have data (first/last PP stage or MTP stage) needs this.
+    if is_tp_rank_0 and (is_first_or_last_stage or mtp_on_this_rank):
+        get_cp_slice_for_thd(batch, cp_group)
+
+    # Broadcast cu_seqlens_size because we need it to create placeholder for cu_seqlens and
+    # cu_seqlens_padded for non TP 0 ranks.
+    if is_tp_rank_0:
+        cu_seqlen_size = torch.tensor(batch['cu_seqlens'].size(0), dtype=torch.int32, device=dev)
+    else:
+        cu_seqlen_size = torch.empty(1, dtype=torch.int32, device=dev)
+    broadcast_tensor(cu_seqlen_size, tp_src_rank, tp_group)
+    cu_seqlen_size = cu_seqlen_size.item()
+
+    # Broadcast total_tokens because we need it to create placeholder for tokens, position_ids,
+    # labels, loss_mask for non TP 0 ranks. Only first stage, last stage,
+    # and stage with mtp need this.
+
+    if is_first_or_last_stage or mtp_on_this_rank:
+        if is_tp_rank_0:
+            total_tokens = torch.tensor(batch['tokens'].size(0), dtype=torch.int32, device=dev)
+        else:
+            total_tokens = torch.empty(1, dtype=torch.int32, device=dev)
+        broadcast_tensor(total_tokens, tp_src_rank, tp_group)
+        total_tokens = total_tokens.item()
+
+    # Step1: Prepare "tokens", "position_ids" for first stage and stage with mtp on all TP ranks.
+    if is_first_stage or mtp_on_this_rank:
+        if is_tp_rank_0:
+            assert batch['tokens'].dtype == torch.int64
+            assert batch['position_ids'].dtype == torch.int64
+            batch['tokens'] = batch['tokens'].view(1, total_tokens)
+            batch['position_ids'] = batch['position_ids'].view(1, total_tokens)
+        else:
+            batch['tokens'] = torch.empty([1, total_tokens], dtype=torch.int64, device=dev)
+            batch['position_ids'] = torch.empty([1, total_tokens], dtype=torch.int64, device=dev)
+    else:
+        # Non first stage rank doesn't need tokens and position_ids.
+        batch['tokens'] = None
+        batch['position_ids'] = None
+
+    # Step2: Prepare "labels", "loss_mask" for last stage and stage with mtp on all TP ranks.
+    if is_last_stage or mtp_on_this_rank:
+        if is_tp_rank_0:
+            assert batch['labels'].dtype == torch.int64
+            assert batch['loss_mask'].dtype == torch.float32
+            batch['labels'] = batch['labels'].view(1, total_tokens)
+            batch['loss_mask'] = batch['loss_mask'].view(1, total_tokens)
+        else:
+            batch['labels'] = torch.empty([1, total_tokens], dtype=torch.int64, device=dev)
+            batch['loss_mask'] = torch.empty([1, total_tokens], dtype=torch.float32, device=dev)
+    else:
+        # Non last stage rank doesn't need labels and loss_mask.
+        batch['labels'] = None
+        batch['loss_mask'] = None
+
+    # Step3: Prepare "cu_seqlens", "cu_seqlens_padded", "max_seqlen" on all ranks.
+    if is_tp_rank_0:
+        assert batch['cu_seqlens'].dtype == torch.int32
+        assert batch['cu_seqlens_padded'].dtype == torch.int32
+        assert batch['cu_seqlens'].dim() == 1
+        assert batch['cu_seqlens_padded'].dim() == 1
+        if type(batch['max_seqlen']) == int:
+            batch['max_seqlen'] = torch.tensor(batch['max_seqlen'], dtype=torch.int32, device=dev)
+        else:
+            assert batch['max_seqlen'].dtype == torch.int32
+            assert batch['max_seqlen'].numel() == 1
+    else:
+        batch['cu_seqlens'] = torch.empty([cu_seqlen_size], dtype=torch.int32, device=dev)
+        batch['cu_seqlens_padded'] = torch.empty([cu_seqlen_size], dtype=torch.int32, device=dev)
+        batch['max_seqlen'] = torch.empty(1, dtype=torch.int32, device=dev)
+
+    # Step4: Prepare "local_cp_size" if dynamic context parallel is enabled.
+    if dynamic_cp:
+        if is_tp_rank_0:
+            if type(batch['local_cp_size']) == int:
+                batch['local_cp_size'] = torch.tensor(
+                    batch['local_cp_size'], dtype=torch.int32, device=dev
+                )
+            else:
+                assert batch['local_cp_size'].dtype == torch.int32
+                assert batch['local_cp_size'].numel() == 1
+        else:
+            batch['local_cp_size'] = torch.empty(1, dtype=torch.int32, device=dev)
+    else:
+        batch['local_cp_size'] = None
+
+    # Broadcast batch inside TP group.
+    broadcast_tensor(batch['tokens'], tp_src_rank, tp_group)
+    broadcast_tensor(batch['position_ids'], tp_src_rank, tp_group)
+    broadcast_tensor(batch['labels'], tp_src_rank, tp_group)
+    broadcast_tensor(batch['loss_mask'], tp_src_rank, tp_group)
+    broadcast_tensor(batch['cu_seqlens'], tp_src_rank, tp_group)
+    broadcast_tensor(batch['cu_seqlens_padded'], tp_src_rank, tp_group)
+    broadcast_tensor(batch['max_seqlen'], tp_src_rank, tp_group)
+    broadcast_tensor(batch['local_cp_size'], tp_src_rank, tp_group)
+
+    # Extract the data from batch after broadcasting.
+    tokens = batch['tokens']
+    position_ids = batch['position_ids']
+    labels = batch['labels']
+    loss_mask = batch['loss_mask']
+    cu_seqlens = batch['cu_seqlens']
+    cu_seqlens_padded = batch['cu_seqlens_padded']
+    max_seqlen = batch['max_seqlen'].item()
+    local_cp_size = batch['local_cp_size'].item() if dynamic_cp else None
+    cp_group = (
+        parallel_state.get_dynamic_data_context_parallel_groups(group_size=local_cp_size)
+        if dynamic_cp
+        else None
+    )
+
+    # Transformer Engine has a bug of cu_seqlens, we must treat cu_seqlens_padded as cu_seqlens to
+    # get the correct result.
+    # TODO: Revert this workaround once TE fixes the issue.
+    packed_seq_params = PackedSeqParams(
+        qkv_format="thd",
+        cu_seqlens_q=cu_seqlens_padded,
+        cu_seqlens_kv=cu_seqlens_padded,
+        cu_seqlens_q_padded=cu_seqlens_padded,
+        cu_seqlens_kv_padded=cu_seqlens_padded,
+        max_seqlen_q=max_seqlen,
+        max_seqlen_kv=max_seqlen,
+        local_cp_size=local_cp_size,
+        cp_group=cp_group,
+    )
+
+    # "attention_mask" is not valid for sequence packing, so set it to None.
+    return tokens, labels, loss_mask, None, position_ids, packed_seq_params
diff --git a/megatron/core/datasets/data_schedule_utils.py b/megatron/core/datasets/data_schedule_utils.py
new file mode 100644
index 00000000000..c59b1742c0a
--- /dev/null
+++ b/megatron/core/datasets/data_schedule_utils.py
@@ -0,0 +1,1020 @@
+# Copyright (c) 2025 NVIDIA CORPORATION.  All rights reserved.
+
+from collections import deque
+from functools import lru_cache
+from math import ceil, log2
+from typing import Callable, Dict, List, Optional, Tuple
+
+import numpy as np
+import torch
+
+from megatron.core.extensions.transformer_engine import get_thd_partitioned_indices
+from megatron.core.rerun_state_machine import RerunDataIterator
+
+
+def get_cp_slice_for_thd(batch, cp_group):
+    """Partition sequence data for context parallelism in THD format.
+
+    Uses TE's THD partitioned indices to split the packed sequence across CP ranks.
+    Only keys present in the batch are sliced.
+
+    Args:
+        batch: Dict with packed sequence data.
+        cp_group: Context parallel process group.
+    """
+    cp_size = cp_group.size()
+    if cp_size <= 1:
+        return
+    cp_rank = cp_group.rank()
+    total_tokens = batch['tokens'].size(0)
+    # Transformer Engine has a bug of cu_seqlens, we must treat cu_seqlens_padded as
+    # cu_seqlens to get the correct result.
+    # TODO: Revert this workaround once TE fixes the issue.
+    cu_seqlens = batch["cu_seqlens_padded"]
+    index = get_thd_partitioned_indices(cu_seqlens, total_tokens, cp_size, cp_rank)
+    for key in ['tokens', 'position_ids', 'labels', 'loss_mask']:
+        if key in batch:
+            batch[key] = batch[key].index_select(0, index)
+
+
+def _unpack_batch(batch: List[Dict[str, torch.Tensor]]) -> List[Dict[str, torch.Tensor]]:
+    """
+    Unpacks the packed samples into a list of sub-samples.
+    Since each sub-sample may be routed to different DPxCP ranks,
+    we unpack the sample here to avoid unnecessarily transferring
+    the entire packed sample.
+    """
+    batch_unpacked = []
+    dev = batch[0]["tokens"].device
+    original_seq_lens = []
+    padded_seq_lens = []
+    for sample in batch:
+        for key in sample.keys():
+            if len(sample[key].shape) == 2:
+                # squeeze the redundant batch dimension added by
+                # default collate_fn in pytorch dataloader
+                # we need a custom collate_fn for THD to avoid this
+                # current THD does not support micro_batch_size > 1 due to sft_dataset.py and
+                # data_loader in data_samples.py
+                sample[key] = sample[key].squeeze(0)
+        for sub_sample in range(sample["cu_seqlens"].shape[0] - 1):
+            sub_sample_dict = {}
+            start_idx = sample["cu_seqlens"][sub_sample]
+            end_idx = sample["cu_seqlens"][sub_sample + 1]
+            if end_idx - start_idx == 0:
+                continue
+            for key in ["tokens", "labels", "loss_mask", "position_ids"]:
+                sub_sample_dict[key] = sample[key][start_idx:end_idx]
+            # Since sft_dataset.py does not provide cu_seqlens_original,
+            # we assume original_seq_len equals padded_seq_len here.
+            # Ideally the dataset should define the pre-padding seq_len.
+            seq_len = (end_idx - start_idx).item()
+            original_seq_lens.append(seq_len)
+            padded_seq_lens.append(seq_len)
+            batch_unpacked.append(sub_sample_dict)
+
+    # Single H2D transfer for all seq lens
+    original_seq_lens_cuda = torch.tensor(original_seq_lens, device=dev)
+    padded_seq_lens_cuda = torch.tensor(padded_seq_lens, device=dev)
+    for i, sub_sample_dict in enumerate(batch_unpacked):
+        sub_sample_dict["original_seq_len"] = original_seq_lens_cuda[i : i + 1]
+        sub_sample_dict["padded_seq_len"] = padded_seq_lens_cuda[i : i + 1]
+
+    return batch_unpacked
+
+
+def _get_global_seqlens_and_ids(subsample_seqlens: torch.Tensor, dp_group):
+    """
+    Gathers the sequence lengths of all subsamples from all DP ranks and calculates global IDs.
+    """
+    # Collect the number of subsamples from all ranks
+    num_local_subsamples = subsample_seqlens.shape[0]
+    local_len = torch.tensor([num_local_subsamples], dtype=torch.int32).cuda()
+    dp_subsample_count = [torch.zeros_like(local_len) for _ in range(dp_group.size())]
+    torch.distributed.all_gather(dp_subsample_count, local_len, group=dp_group)
+
+    # Find the max number of subsamples across all ranks and pad subsample_seqlens to max length
+    dp_subsample_counts = torch.stack(dp_subsample_count, dim=0).cpu().view(-1)
+    max_sub_samples = int(dp_subsample_counts.max().item())
+
+    if num_local_subsamples < max_sub_samples:
+        subsample_seqlens_padded = torch.cat(
+            [
+                subsample_seqlens,
+                torch.zeros(max_sub_samples - num_local_subsamples, dtype=torch.int32).cuda(),
+            ],
+            dim=0,
+        )
+    else:
+        subsample_seqlens_padded = subsample_seqlens
+
+    # Gather the subsample_seqlens from all ranks
+    seqlens_gathered = [torch.empty_like(subsample_seqlens_padded) for _ in range(dp_group.size())]
+    torch.distributed.all_gather(seqlens_gathered, subsample_seqlens_padded, group=dp_group)
+
+    # Trim each seqlens_gathered to the length of the correct sample
+    for dp_rank, seqlen in enumerate(seqlens_gathered):
+        seqlens_gathered[dp_rank] = seqlen[: dp_subsample_counts[dp_rank]]
+
+    seqlens_gathered = torch.cat(seqlens_gathered, dim=0)
+    seqlens_gathered = seqlens_gathered.cpu().tolist()
+
+    # Calculate the offsets to assign unique global ID to each subsample.
+    csum = torch.cumsum(dp_subsample_counts, dim=0, dtype=torch.int32)
+    offsets = torch.cat([torch.zeros(1, dtype=torch.int32), csum], dim=0)
+
+    # Calculate global ID for each subsample
+    dp_rank = dp_group.rank()
+    global_ids = torch.arange(len(seqlens_gathered), dtype=torch.int32).cuda()
+
+    # Create a list of (global_id, seqlen) tuples for scheduling
+    global_id_seqlens = [(i, seqlens_gathered[i]) for i in range(len(global_ids))]
+
+    # Get the global IDs locally present on this rank
+    start_idx = offsets[dp_rank]
+    end_idx = offsets[dp_rank + 1]
+
+    global_ids_this_rank = global_ids[start_idx:end_idx]
+
+    return global_id_seqlens, global_ids_this_rank, offsets, seqlens_gathered
+
+
+def _pack_sequences(
+    samples: List,
+    padded_lengths: torch.Tensor,
+    original_lengths: torch.Tensor,
+    local_cp_size: Optional[torch.Tensor],
+    dev: torch.device,
+) -> Dict[str, torch.Tensor]:
+    """Pack multiple samples into a single packed sample."""
+
+    def _pack_tensors(tensors):
+        return torch.cat([t.reshape(-1) for t in tensors], dim=0)
+
+    tokens = _pack_tensors([sample["tokens"] for sample in samples])
+    labels = _pack_tensors([sample["labels"] for sample in samples])
+    loss_mask = _pack_tensors([sample["loss_mask"] for sample in samples])
+    position_ids = _pack_tensors([sample["position_ids"] for sample in samples])
+
+    new_sample = {}
+    new_sample["tokens"] = tokens
+    new_sample["labels"] = labels
+    new_sample["loss_mask"] = loss_mask
+    new_sample["position_ids"] = position_ids
+
+    padded_lengths = padded_lengths.to(device=dev, dtype=torch.int32, non_blocking=True).reshape(-1)
+    cu_seqlens_padded = torch.empty(padded_lengths.numel() + 1, device=dev, dtype=torch.int32)
+    cu_seqlens_padded[0] = 0
+    cu_seqlens_padded[1:] = torch.cumsum(padded_lengths, dim=0)
+    max_seqlen = torch.max(padded_lengths).to(dtype=torch.int32)
+
+    new_sample["cu_seqlens_padded"] = cu_seqlens_padded
+    new_sample["max_seqlen"] = max_seqlen
+
+    original_lengths = original_lengths.to(
+        device=dev, dtype=torch.int32, non_blocking=True
+    ).reshape(-1)
+    cu_seqlens = torch.empty(original_lengths.numel() + 1, device=dev, dtype=torch.int32)
+    cu_seqlens[0] = 0
+    cu_seqlens[1:] = torch.cumsum(original_lengths, dim=0).reshape(-1)
+    new_sample["cu_seqlens"] = cu_seqlens
+
+    if local_cp_size is not None:
+        new_sample["local_cp_size"] = local_cp_size
+
+    return new_sample
+
+
+def broadcast_tensor(item, src_rank, group) -> None:
+    """Broadcast a tensor from src_rank to all ranks in the group."""
+    if item is not None:
+        torch.distributed.broadcast(item, src_rank, group=group)
+
+
+def broadcast_to_pp_group(
+    new_samples,
+    num_micro_batches,
+    seqlen_sum_this_global_batch,
+    seqlen_squared_sum_this_global_batch,
+    pp_group,
+    dev,
+    is_dynamic_cp: bool = False,
+):
+    """
+    Broadcast num_micro_batches, seqlen_sum_this_global_batch,
+    seqlen_squared_sum_this_global_batch and metadata to middle PP stages.
+    Before this broadcast, the new_samples on middle PP stages are None,
+    after this broadcast, the new_samples on middle PP stages contain the metadata but
+    without tokens, labels, loss_mask, position_ids.
+    """
+
+    pp_src_rank = torch.distributed.get_process_group_ranks(pp_group)[0]
+
+    if pp_group.size() > 2:
+        if pp_group.rank() == 0:
+            tensor_list = [
+                torch.tensor(
+                    [
+                        num_micro_batches,
+                        seqlen_sum_this_global_batch,
+                        seqlen_squared_sum_this_global_batch,
+                    ],
+                    dtype=torch.float32,
+                ).cuda()
+            ]
+            for sample in new_samples:
+                tensor_list.append(sample["max_seqlen"].unsqueeze(0))
+
+            if is_dynamic_cp:
+                for sample in new_samples:
+                    tensor_list.append(sample["local_cp_size"].unsqueeze(0))
+
+            for sample in new_samples:
+                tensor_list.append(sample["cu_seqlens"])
+                tensor_list.append(sample["cu_seqlens_padded"])
+            info_to_broadcast = torch.cat(tensor_list, dim=0).to(device=dev, dtype=torch.float32)
+            info_length_tensor = torch.tensor(info_to_broadcast.shape[0], dtype=torch.int32).cuda()
+            broadcast_tensor(info_length_tensor, pp_src_rank, pp_group)
+            broadcast_tensor(info_to_broadcast, pp_src_rank, pp_group)
+        else:
+            info_length_tensor = torch.tensor(0, dtype=torch.int32).cuda()
+            broadcast_tensor(info_length_tensor, pp_src_rank, pp_group)
+            info_to_broadcast = torch.empty(info_length_tensor.item(), dtype=torch.float32).cuda()
+            broadcast_tensor(info_to_broadcast, pp_src_rank, pp_group)
+            if pp_group.rank() != pp_group.size() - 1:
+                # middle PP stages receive the broadcasted info and unpack it
+                info_numpy = info_to_broadcast.cpu().numpy()
+                num_micro_batches = int(info_numpy[0])
+                seqlen_sum_this_global_batch = info_numpy[1]
+                seqlen_squared_sum_this_global_batch = info_numpy[2]
+                max_seqlens = info_to_broadcast[3 : 3 + num_micro_batches]
+                local_cp_sizes = (
+                    info_to_broadcast[3 + num_micro_batches : 3 + 2 * num_micro_batches]
+                    if is_dynamic_cp
+                    else None
+                )
+                cu_seqlens_list = []
+                cu_seqlens_padded_list = []
+                # cu_seqlens always starts with 0, and the other metadata values
+                # (num_micro_batches, seqlen_sum, seqlen_squared_sum, max_seqlens)
+                # are always positive, so we can use 0 as the delimiter to locate
+                # the start of each cu_seqlens / cu_seqlens_padded tensor.
+                # This avoids an extra broadcast for the lengths of cu_seqlens.
+                indices = np.where(info_numpy == 0)[0]
+                for i in range(num_micro_batches):
+                    cu_seqlens_list.append(info_to_broadcast[indices[i * 2] : indices[i * 2 + 1]])
+                    if i == num_micro_batches - 1:
+                        cu_seqlens_padded_list.append(info_to_broadcast[indices[i * 2 + 1] :])
+                    else:
+                        cu_seqlens_padded_list.append(
+                            info_to_broadcast[indices[i * 2 + 1] : indices[i * 2 + 2]]
+                        )
+
+                new_samples = []
+                for i in range(num_micro_batches):
+                    new_sample = {}
+                    new_sample["max_seqlen"] = max_seqlens[i].to(torch.int32)
+                    new_sample["cu_seqlens"] = cu_seqlens_list[i].to(torch.int32)
+                    new_sample["cu_seqlens_padded"] = cu_seqlens_padded_list[i].to(torch.int32)
+                    if is_dynamic_cp:
+                        new_sample["local_cp_size"] = local_cp_sizes[i].to(torch.int32)
+                    new_samples.append(new_sample)
+
+    return (
+        new_samples,
+        num_micro_batches,
+        seqlen_sum_this_global_batch,
+        seqlen_squared_sum_this_global_batch,
+    )
+
+
+def broadcast_scalars(values: List, group, dev, dtype=torch.float32) -> List:
+    """
+    Broadcast scalar values from rank 0 to all ranks in the group.
+
+    Args:
+        values: List of scalar values to broadcast (only used on rank 0).
+        group: The process group to broadcast within.
+        dev: The device to use for the tensor.
+        dtype: The data type for the tensor.
+
+    Returns:
+        List of broadcasted values.
+    """
+    if group.size() <= 1:
+        return values
+
+    src_rank = torch.distributed.get_process_group_ranks(group)[0]
+    num_values = len(values)
+
+    if group.rank() == 0:
+        info_to_broadcast = torch.tensor(values, dtype=dtype, device=dev)
+    else:
+        info_to_broadcast = torch.zeros(num_values, dtype=dtype, device=dev)
+
+    broadcast_tensor(info_to_broadcast, src_rank, group)
+
+    if group.rank() != 0:
+        values = info_to_broadcast.cpu().tolist()
+
+    return values
+
+
+def create_data_iterator(
+    new_samples, tp_group, config, vpp_has_data=None, is_dynamic_cp: bool = False
+):
+    """Handle virtual pipeline parallelism.
+
+    For VPP, each PP rank needs a list of data iterators (one per VPP stage).
+    VPP stages that originally had a data_iterator (indicated by vpp_has_data)
+    get full samples; others get metadata only (cu_seqlens, cu_seqlens_padded,
+    max_seqlen).
+
+    Args:
+        new_samples: The packed samples after scheduling.
+        tp_group: Tensor parallel process group.
+        config: Model parallel config.
+        vpp_has_data: A list of booleans (one per VPP stage) indicating which
+            VPP stages originally had a data_iterator. None if VPP is disabled.
+    """
+    if (
+        config.virtual_pipeline_model_parallel_size is not None
+        and config.virtual_pipeline_model_parallel_size > 1
+    ):
+        vpp_size = config.virtual_pipeline_model_parallel_size
+        if tp_group.rank() == 0:
+            metadata_keys = ["max_seqlen", "cu_seqlens", "cu_seqlens_padded"]
+            if is_dynamic_cp:
+                metadata_keys.append("local_cp_size")
+            metadata = [
+                {k: sample[k] for k in metadata_keys if k in sample} for sample in new_samples
+            ]
+            new_data_iterator = []
+            for i in range(vpp_size):
+                if vpp_has_data is not None and vpp_has_data[i]:
+                    new_data_iterator.append(RerunDataIterator(iter(new_samples)))
+                else:
+                    new_data_iterator.append(RerunDataIterator(iter(metadata)))
+        else:
+            new_data_iterator = [None for _ in range(vpp_size)]
+    else:
+        new_data_iterator = RerunDataIterator(iter(new_samples)) if tp_group.rank() == 0 else None
+
+    return new_data_iterator
+
+
+def reroute_samples_to_dcp_ranks(
+    batch,
+    global_ids_this_rank,
+    global_id_seqlens,
+    sample_id_groups,
+    offsets,
+    dp_group,
+    tp_group,
+    dp_cp_group,
+    total_dcp_gpus,
+):
+    """
+    Reroutes the sub-samples to the correct rank after scheduling.
+
+    For each key in the batch dict, we perform an all-to-all communication
+    to transfer the data to the correct ranks.
+    """
+
+    def _gid_to_src_rank(gid: int) -> int:
+        dp_src_rank = torch.bucketize(gid, offsets[1:] - 1)
+        dcp_rank = (
+            torch.distributed.get_process_group_ranks(dp_group)[dp_src_rank] // tp_group.size()
+        ) % dp_cp_group.size()
+        return dcp_rank
+
+    gid2local_id = {int(gid): i for i, gid in enumerate(global_ids_this_rank)}
+    dcp_rank = dp_cp_group.rank()
+    dp_ranks = torch.distributed.get_process_group_ranks(dp_group)
+    dp_ranks = [(r // tp_group.size()) % dp_cp_group.size() for r in dp_ranks]
+
+    data_keys = batch[0].keys()
+
+    # Create the send plan
+    combined_sample_id_groups: List[List[int]] = [[] for _ in range(total_dcp_gpus)]
+    for d in range(total_dcp_gpus):
+        for sample_id_group in sample_id_groups:
+            combined_sample_id_groups[d].extend(sample_id_group[d])
+    for dest_rank in range(total_dcp_gpus):
+        combined_sample_id_groups[dest_rank].sort()
+
+    send_ids_sorted = [
+        gid for d in dp_ranks for gid in combined_sample_id_groups[d] if gid in global_ids_this_rank
+    ]
+
+    send_num_split = [0] * total_dcp_gpus
+    send_lens_split = [0] * total_dcp_gpus
+    for dest_rank in range(total_dcp_gpus):
+        if dest_rank in dp_ranks:
+            send_seq_lens = [
+                global_id_seqlens[gid][1]
+                for gid in combined_sample_id_groups[dest_rank]
+                if gid in global_ids_this_rank
+            ]
+            send_num_split[dest_rank] = len(send_seq_lens)
+            send_lens_split[dest_rank] = sum(send_seq_lens)
+        else:
+            send_lens_split[dest_rank] = 0
+
+    # Create the recv plan
+    recv_sample_id_groups = [[] for _ in range(total_dcp_gpus)]
+    for gid in combined_sample_id_groups[dcp_rank]:
+        src_rank = _gid_to_src_rank(gid)
+        recv_sample_id_groups[src_rank].append(gid)
+
+    recv_lens_split = [0] * total_dcp_gpus
+    for src_rank in range(total_dcp_gpus):
+        recv_lens_split[src_rank] = sum(
+            [global_id_seqlens[gid][1] for gid in recv_sample_id_groups[src_rank]]
+        )
+
+    recv_ids_sorted = [gid for d in range(total_dcp_gpus) for gid in recv_sample_id_groups[d]]
+    recv_counts = [len(recv_sample_id_groups[d]) for d in range(total_dcp_gpus)]
+
+    recv_samples = [{k: None for k in data_keys} for _ in range(sum(recv_counts))]
+
+    def _pack_sample_by_key(key: str) -> torch.Tensor:
+        flattened_tensors = []
+        for gid in send_ids_sorted:
+            t = batch[gid2local_id[gid]][key].to(torch.cuda.current_device(), non_blocking=True)
+            flattened_tensors.append(t.reshape(-1))
+        return (
+            torch.cat(flattened_tensors, dim=0)
+            if flattened_tensors
+            else torch.empty(0, device=torch.cuda.current_device(), dtype=batch[0][key].dtype)
+        )
+
+    def _unpack_sample_by_key(key: str, recv_tensor: torch.Tensor):
+        cursor = 0
+        for i, gid in enumerate(recv_ids_sorted):
+            sample_len = (
+                1 if key in ["original_seq_len", "padded_seq_len"] else global_id_seqlens[gid][1]
+            )
+            recv_samples[i][key] = recv_tensor[cursor : cursor + sample_len]
+            cursor += sample_len
+
+    for key in data_keys:
+        output_split_sizes, input_split_sizes = (
+            (recv_counts, send_num_split)
+            if key in ["original_seq_len", "padded_seq_len"]
+            else (recv_lens_split, send_lens_split)
+        )
+        send_tensor = _pack_sample_by_key(key)
+        recv_tensor_size = sum(output_split_sizes)
+        recv_tensor = torch.empty(
+            recv_tensor_size, device=torch.cuda.current_device(), dtype=send_tensor.dtype
+        )
+        torch.distributed.all_to_all_single(
+            output=recv_tensor,
+            input=send_tensor,
+            output_split_sizes=output_split_sizes,
+            input_split_sizes=input_split_sizes,
+            group=dp_cp_group,
+        )
+        _unpack_sample_by_key(key, recv_tensor)
+
+    recv_sample_with_id = {recv_id: recv_samples[i] for i, recv_id in enumerate(recv_ids_sorted)}
+    return recv_sample_with_id
+
+
+def build_packed_microbatches(
+    samples_this_rank_with_id: Dict[int, Dict[str, torch.Tensor]],
+    sample_id_groups: List[List[List[int]]],
+    dcp_rank: int,
+    dev: torch.device,
+    is_dynamic_cp: bool = False,
+) -> List[Dict[str, torch.Tensor]]:
+    """Build packed samples for each microbatch.
+
+    Args:
+        samples_this_rank_with_id: Mapping from global sample ID to sample dict,
+            as returned by reroute_samples_to_dcp_ranks.
+        sample_id_groups: Per-microbatch, per-rank lists of global sample IDs.
+        dcp_rank: This rank's index within the DP×CP group.
+        dev: Target device.
+        is_dynamic_cp: Whether dynamic context parallel is enabled.
+    """
+    num_micro_batches = len(sample_id_groups)
+    seg_starts: List[int] = [0]
+    original_lens_tensors = []
+    padded_lens_tensors = []
+
+    grouped_samples = [
+        [
+            samples_this_rank_with_id[sub_sample_id]
+            for sub_sample_id in sample_id_groups[i][dcp_rank]
+        ]
+        for i in range(num_micro_batches)
+    ]
+
+    local_cp_sizes_gpu = None
+    if is_dynamic_cp:
+        local_cp_sizes_cpu: List[int] = []
+        for i in range(num_micro_batches):
+            sample_ids_this_group = sample_id_groups[i][dcp_rank]
+            local_cp_sizes_cpu.append(
+                len(
+                    [
+                        1
+                        for sample_ids in sample_id_groups[i]
+                        if sample_ids_this_group[0] in sample_ids
+                    ]
+                )
+            )
+        local_cp_sizes_gpu = torch.tensor(local_cp_sizes_cpu, dtype=torch.int32, device=dev)
+
+    for i in range(num_micro_batches):
+        samples = grouped_samples[i]
+        seg_starts.append(seg_starts[-1] + len(samples))
+        original_lens_tensors.extend([s["original_seq_len"].reshape(-1) for s in samples])
+        padded_lens_tensors.extend([s["padded_seq_len"].reshape(-1) for s in samples])
+
+    padded_lens_all_gpu = torch.cat(padded_lens_tensors, dim=0).to(dtype=torch.int32)
+    original_lens_all_gpu = torch.cat(original_lens_tensors, dim=0).to(dtype=torch.int32)
+
+    new_samples: List[Dict[str, torch.Tensor]] = []
+    for i in range(num_micro_batches):
+        samples = grouped_samples[i]
+        lens_padded = padded_lens_all_gpu[seg_starts[i] : seg_starts[i + 1]]
+        lens_original = original_lens_all_gpu[seg_starts[i] : seg_starts[i + 1]]
+        local_cp_size = local_cp_sizes_gpu[i] if is_dynamic_cp else None
+        new_sample = _pack_sequences(samples, lens_padded, lens_original, local_cp_size, dev)
+        new_samples.append(new_sample)
+
+    return new_samples
+
+
+def get_batch_and_global_seqlens(data_iterator, num_microbatches, dp_group):
+    """
+    Get the batch and global sequence lengths.
+    Each DP rank loads the same number of sequences, so we need to gather the sequence
+    lengths from all ranks then we can schedule the sequences into groups.
+    Args:
+        data_iterator: The data iterator.
+        num_microbatches: The number of microbatches.
+        dp_group: The data parallel group.
+
+    Returns:
+        batch: The batch.
+        global_id_seqlens: The global sequence lengths.
+        global_ids_this_rank: The global IDs locally present on this rank.
+    """
+
+    batch_list = [next(data_iterator) for _ in range(num_microbatches)]
+
+    batch = []
+    for item in batch_list:
+        if isinstance(item, dict):
+            batch.append(item)
+        elif isinstance(item, list):
+            batch.extend(item)
+        else:
+            raise ValueError(f"Invalid item type: {type(item)}")
+
+    # in sft_dataset.py, sequences are already packed before rescheduling,
+    # so we need to unpack them here and repack after rescheduling.
+    # This is only to adapt to the current megatron-lm sft_dataset.
+    # If you implement your own dataset, just have __getitem__ return List[Dict]
+    # and this step can be skipped.
+    batch = _unpack_batch(batch)
+
+    subsample_seqlens = torch.cat([sample["padded_seq_len"] for sample in batch]).to(
+        dtype=torch.int32, device=torch.cuda.current_device()
+    )
+
+    global_id_seqlens, global_ids_this_rank, offsets, seqlens_gathered = (
+        _get_global_seqlens_and_ids(subsample_seqlens, dp_group)
+    )
+
+    return batch, global_id_seqlens, global_ids_this_rank, offsets, seqlens_gathered
+
+
+# =============================================================================
+# Dynamic CP scheduling algorithms (used by DefaultDynamicCPScheduler)
+# =============================================================================
+
+
+def next_hdp_group(
+    sample_seqlens: List[Tuple[int, int]],
+    compute_estimator: Callable[[int], float],
+    total_gpus: int,
+    gpus_needed_fn: Callable[[int], int],
+    make_buckets_equal_fn: Callable,
+    max_seq_len_per_rank: float,
+    get_total_workload_fn: Callable,
+    delta: float = 0.05,
+    strategy: str = "dp",
+    eps_bucket: float = 0.10,
+) -> Tuple[List[List[int]], List[Tuple[int, int]], List[float], List[List[int]]]:
+    """Form one balanced micro-batch group across DPxCP ranks.
+
+    This is a standalone version of the scheduling algorithm extracted from
+    DefaultDynamicCPScheduler so it can live in a utils module.
+
+    Extra args compared to the method version:
+        gpus_needed_fn: callable(seq_len) -> int
+        make_buckets_equal_fn: callable(sample_seqlens, compute_estimator) -> list[deque]
+        max_seq_len_per_rank: max tokens per rank for packing
+        get_total_workload_fn: callable(seq_len, cp_size) -> float
+    """
+    if not sample_seqlens:
+        return (
+            [[] for _ in range(total_gpus)],
+            [],
+            [0.0 for _ in range(total_gpus)],
+            [[] for _ in range(total_gpus)],
+        )
+
+    buckets = make_buckets_equal_fn(sample_seqlens, compute_estimator)
+
+    micro_batches = [[] for _ in range(total_gpus)]
+    exec_times = [0.0 for _ in range(total_gpus)]
+    sample_ids_per_gpu = [[] for _ in range(total_gpus)]
+    packing_sequence_len = {}
+
+    gpu_group_id = [None] * total_gpus
+    group_members = {}
+    group_size = {}
+    next_gid = 0
+
+    pp_cursor = 0
+    prev_needed = None
+    check_balance = False
+
+    while buckets:
+        sample_seq_tuple = bucket_idx = None
+        needed = None
+
+        scan_order = (
+            range(len(buckets))
+            if strategy == "dp"
+            else [(pp_cursor + i) % len(buckets) for i in range(len(buckets))]
+        )
+
+        for idx in scan_order:
+            if not buckets[idx]:
+                continue
+            cand_tuple = buckets[idx][0]
+            cand_seq_len = cand_tuple[1]
+            needed = gpus_needed_fn(cand_seq_len)
+
+            candidate_gids = [gid for gid, sz in group_size.items() if sz == needed]
+            free_ranks = [r for r, gid in enumerate(gpu_group_id) if gid is None]
+            if candidate_gids or len(free_ranks) >= needed:
+                sample_seq_tuple, bucket_idx = cand_tuple, idx
+                break
+
+        if sample_seq_tuple is None:
+            break
+
+        if strategy == "pp":
+            pp_cursor = (bucket_idx + 1) % len(buckets)
+
+        sample_id, seq_len = sample_seq_tuple
+        needed = gpus_needed_fn(seq_len)
+        if prev_needed is None:
+            prev_needed = needed
+
+        candidate_gids = [
+            gid
+            for gid, sz in group_size.items()
+            if sz == needed and packing_sequence_len[gid] + seq_len / needed <= max_seq_len_per_rank
+        ]
+        if candidate_gids:
+            best_gid, best_load = min(
+                ((gid, max(exec_times[r] for r in group_members[gid])) for gid in candidate_gids),
+                key=lambda t: t[1],
+            )
+        else:
+            best_gid, best_load = None, float("inf")
+
+        free_ranks = [r for r, gid in enumerate(gpu_group_id) if gid is None]
+        if len(free_ranks) >= needed:
+            free_sorted = sorted(free_ranks, key=lambda r: exec_times[r])
+            new_members = free_sorted[:needed]
+            new_load = exec_times[new_members[-1]]
+
+            if new_load < best_load:
+                best_gid = None
+                chosen_members = new_members
+            else:
+                chosen_members = group_members[best_gid]
+        else:
+            if best_gid is None:
+                break
+            chosen_members = group_members[best_gid]
+
+        if best_gid is None:
+            best_gid = next_gid
+            next_gid += 1
+            group_members[best_gid] = chosen_members
+            group_size[best_gid] = needed
+            for r in chosen_members:
+                gpu_group_id[r] = best_gid
+
+        per_gpu_cost = compute_estimator(seq_len)
+
+        packing_sequence_len[best_gid] = packing_sequence_len.get(best_gid, 0) + seq_len / needed
+        for r in chosen_members:
+            micro_batches[r].append(seq_len)
+            exec_times[r] += per_gpu_cost
+            sample_ids_per_gpu[r].append(sample_id)
+
+        buckets[bucket_idx].popleft()
+
+        while buckets and not buckets[0]:
+            buckets.pop(0)
+            pp_cursor %= max(1, len(buckets))
+
+        if needed < prev_needed:
+            check_balance = True
+
+        if (
+            check_balance
+            and buckets
+            and max(exec_times) - min(exec_times) <= delta * max(exec_times)
+        ):
+            break
+
+    leftovers = []
+    for b in buckets:
+        for sample_seq_tuple in b:
+            leftovers.append(sample_seq_tuple)
+
+    def trim_overload():
+        while True:
+            cur_max = max(exec_times)
+            cur_min = min(exec_times)
+            cur_slack = cur_max - cur_min
+            if cur_slack <= delta * cur_max:
+                break
+            if cur_min == 0:
+                break
+
+            max_r = exec_times.index(cur_max)
+            gid = gpu_group_id[max_r]
+            members = group_members[gid]
+
+            if not micro_batches[max_r] or len(micro_batches[max_r]) <= 1:
+                break
+
+            seq = micro_batches[max_r][-1]
+            per_gpu_cost = compute_estimator(seq)
+
+            proj_times = exec_times[:]
+            for r in members:
+                proj_times[r] -= per_gpu_cost
+
+            proj_slack = max(proj_times) - min(proj_times)
+
+            if proj_slack < cur_slack:
+                sample_id_to_remove = sample_ids_per_gpu[max_r][-1]
+                for r in members:
+                    micro_batches[r].pop()
+                    exec_times[r] -= per_gpu_cost
+                    sample_ids_per_gpu[r].pop()
+                leftovers.append((sample_id_to_remove, seq))
+            else:
+                break
+
+    # TODO(tailaim): uncomment this to support different ranks have different num_microbatches
+    # trim_overload()
+
+    total_work_before = sum(len(mb) for mb in micro_batches)
+
+    def fill_empty_gpus(micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size):
+        empty_gpus = [i for i in range(total_gpus) if not micro_batches[i]]
+        if not empty_gpus:
+            return (micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size)
+
+        existing_group_sizes = set(group_size.values())
+        assert (
+            existing_group_sizes
+        ), "There should be at least one group existing, cannot redistribute, "
+        "try to increase 'max-seqlen-per-dp-cp-rank'."
+
+        min_group_size = min(existing_group_sizes)
+        next_power = min(min_group_size * 2, total_gpus)
+
+        for gid, size in group_size.items():
+            if size == min_group_size:
+                members = group_members[gid]
+                needed_count = next_power - min_group_size
+                group_start_gpu = members[0]
+                group_end_gpu = members[-1]
+                empty_gpu = [idx for idx, work in enumerate(micro_batches) if not work][0]
+                assert not all(
+                    work for work in micro_batches[empty_gpu : empty_gpu + needed_count]
+                ), "Empty GPUs were detected but not enough to expand."
+                work_to_push = micro_batches[group_end_gpu + 1 : empty_gpu]
+                exec_times_to_push = exec_times[group_end_gpu + 1 : empty_gpu]
+                sample_ids_to_push = sample_ids_per_gpu[group_end_gpu + 1 : empty_gpu]
+
+                new_micro_batches = [[]] * len(micro_batches)
+                new_exec_times = [0.0] * len(exec_times)
+                new_sample_ids_per_gpu = [[]] * len(sample_ids_per_gpu)
+
+                for i in range(group_start_gpu):
+                    new_micro_batches[i] = micro_batches[i]
+                    new_exec_times[i] = exec_times[i]
+                    new_sample_ids_per_gpu[i] = sample_ids_per_gpu[i]
+
+                for i in range(group_start_gpu, group_end_gpu + needed_count + 1):
+                    new_micro_batches[i] = micro_batches[group_end_gpu]
+                    new_exec_times[i] = get_total_workload_fn(
+                        micro_batches[group_end_gpu][0], next_power
+                    )
+                    new_sample_ids_per_gpu[i] = sample_ids_per_gpu[group_end_gpu]
+
+                for i, work in enumerate(work_to_push):
+                    new_micro_batches[group_end_gpu + needed_count + 1 + i] = work
+                    new_exec_times[group_end_gpu + needed_count + 1 + i] = exec_times_to_push[i]
+                    new_sample_ids_per_gpu[group_end_gpu + needed_count + 1 + i] = (
+                        sample_ids_to_push[i]
+                    )
+
+                group_size[gid] = next_power
+                group_members[gid] = list(range(members[0], members[-1] + needed_count + 1))
+                for pushed_gid in group_size.keys():
+                    if pushed_gid > gid:
+                        group_members[pushed_gid] = [
+                            x + needed_count for x in group_members[pushed_gid]
+                        ]
+
+                return (
+                    new_micro_batches,
+                    new_exec_times,
+                    new_sample_ids_per_gpu,
+                    group_members,
+                    group_size,
+                )
+
+    empty_gpus = any([not micro_batches[i] for i in range(total_gpus)])
+    while empty_gpus:
+        micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size = fill_empty_gpus(
+            micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size
+        )
+        empty_gpus = any([not micro_batches[i] for i in range(total_gpus)])
+
+    total_work_after = sum(len(mb) for mb in micro_batches)
+    assert (
+        total_work_after >= total_work_before
+    ), f"Samples were removed: {total_work_before} -> {total_work_after}"
+
+    return micro_batches, leftovers, exec_times, sample_ids_per_gpu
+
+
+def align_sample_id_groups(sample_id_groups: List, microbatch_group_size_per_vp_stage: int) -> List:
+    """Align len(sample_id_groups) to microbatch_group_size_per_vp_stage when VPP is enabled.
+
+    Standalone version extracted from DefaultDynamicCPScheduler.
+    """
+    multiple = int(microbatch_group_size_per_vp_stage)
+    remainder = (-len(sample_id_groups)) % multiple
+    i = len(sample_id_groups) - 1
+
+    def split_group(sample_id_group):
+        total_hdp_ranks = len(sample_id_group)
+        cu_ranks = [0]
+        prev_cp_size = 0
+
+        while cu_ranks[-1] != total_hdp_ranks:
+            start_rank = cu_ranks[-1]
+            sid0 = sample_id_group[start_rank][0]
+            cp_size = 0
+            for r in range(start_rank, total_hdp_ranks):
+                if sid0 in sample_id_group[r]:
+                    cp_size += 1
+                else:
+                    break
+            assert (
+                prev_cp_size == 0 or cp_size <= prev_cp_size
+            ), f"split_group: CP size is not decreasing: prev={prev_cp_size}, cur={cp_size}"
+            cu_ranks.append(start_rank + cp_size)
+            prev_cp_size = cp_size
+        if len(cu_ranks) == 2:
+            return None, None
+
+        k = 0
+        while cu_ranks[k] < total_hdp_ranks // 2:
+            k += 1
+
+        old_mb = sample_id_group[: cu_ranks[k]] + [[] for _ in range(total_hdp_ranks - cu_ranks[k])]
+        new_mb = sample_id_group[cu_ranks[k] :] + [[] for _ in range(cu_ranks[k])]
+        old_mb = fill_empty_by_expanding_cp(old_mb)
+        new_mb = fill_empty_by_expanding_cp(new_mb)
+        return new_mb, old_mb
+
+    def fill_empty_by_expanding_cp(sample_id_group):
+        def fill_empty(sample_id_group):
+            empty_size = sum(1 for x in sample_id_group if len(x) == 0)
+            i = len(sample_id_group) - 1 - empty_size
+            prev_cp_size = 0
+            while i >= 0:
+                sid0 = sample_id_group[i][0]
+                cp_size = 0
+                while sid0 in sample_id_group[i] and i >= 0:
+                    cp_size += 1
+                    i -= 1
+                if cp_size > prev_cp_size and prev_cp_size != 0:
+                    start_idx = i + 1 + cp_size
+                    end_idx = -empty_size + prev_cp_size if -empty_size + prev_cp_size < 0 else None
+                    sample_id_group[start_idx + 2 * prev_cp_size : end_idx] = sample_id_group[
+                        start_idx + prev_cp_size : -empty_size
+                    ]
+                    sample_id_group[start_idx + prev_cp_size : start_idx + 2 * prev_cp_size] = (
+                        sample_id_group[start_idx : start_idx + prev_cp_size]
+                    )
+                    break
+                elif cp_size <= empty_size and i == -1:
+                    end_idx = -empty_size + cp_size if -empty_size + cp_size < 0 else None
+                    sample_id_group[2 * cp_size : end_idx] = sample_id_group[cp_size:-empty_size]
+                    sample_id_group[cp_size : 2 * cp_size] = sample_id_group[0:cp_size]
+                    break
+                prev_cp_size = cp_size
+            return sample_id_group
+
+        while len(sample_id_group[-1]) == 0:
+            sample_id_group = fill_empty(sample_id_group)
+        return sample_id_group
+
+    attempts_since_split = 0
+    while remainder > 0:
+        if i < 0:
+            if attempts_since_split >= len(sample_id_groups):
+                assert False, 'align_sample_id_groups: no tail microbatch has enough ids to split'
+            i = len(sample_id_groups) - 1
+        group1, group2 = split_group(sample_id_groups[i])
+        if group1 is not None and group2 is not None:
+            sample_id_groups[i] = group1
+            sample_id_groups.append(group2)
+            remainder -= 1
+            attempts_since_split = 0
+        else:
+            attempts_since_split += 1
+        i -= 1
+
+    return sample_id_groups
+
+
+# =============================================================================
+# Workload estimation helpers for dynamic CP scheduling
+# =============================================================================
+
+
+@lru_cache(maxsize=128)
+def dcp_gpus_needed(seq_len: int, max_seq_len_per_rank: int, min_cp_size: int = 1) -> int:
+    """Number of GPUs needed, rounded up to the next power of 2, lower-bounded by min_cp_size."""
+    raw = max(1, 2 ** ceil(log2(seq_len / max_seq_len_per_rank)))
+    return max(min_cp_size, raw)
+
+
+@lru_cache(maxsize=128)
+def dcp_get_total_workload(
+    seq_length: int, max_seq_len_per_rank: int, cp_size: Optional[int] = None, min_cp_size: int = 1
+) -> float:
+    """Estimate workload of a sub-sample for scheduling balance."""
+    if cp_size is None:
+        cp_size = dcp_gpus_needed(seq_length, max_seq_len_per_rank, min_cp_size)
+    return (seq_length * seq_length) / cp_size
+
+
+def dcp_make_buckets_equal(
+    sample_seqlens: List[Tuple[int, int]],
+    compute_estimator: Callable,
+    max_seq_len_per_rank: int,
+    min_cp_size: int = 1,
+) -> List[deque]:
+    """Split samples into buckets of roughly equal work, one per unique CP size."""
+    seqlens = [seq_len for _, seq_len in sample_seqlens]
+    k = len({dcp_gpus_needed(L, max_seq_len_per_rank, min_cp_size) for L in seqlens})
+
+    work = []
+    for _, s in sample_seqlens:
+        cp_size = dcp_gpus_needed(s, max_seq_len_per_rank, min_cp_size)
+        work.append(compute_estimator(s, cp_size))
+    total_work = sum(work)
+    target = total_work / k
+    buckets, cur, cur_work = [], [], 0.0
+    remaining_k = k
+
+    for i, (sample_id, seq_len) in enumerate(sample_seqlens):
+        w = compute_estimator(seq_len)
+        projected = cur_work + w
+        if cur and (
+            projected > target * 1.1 or len(sample_seqlens) - i <= remaining_k - len(buckets)
+        ):
+            buckets.append(deque(cur))
+            cur, cur_work = [], 0.0
+            remaining_k -= 1
+        cur.append((sample_id, seq_len))
+        cur_work += w
+
+    if cur:
+        buckets.append(deque(cur))
+    return buckets
diff --git a/megatron/core/datasets/gpt_dataset.py b/megatron/core/datasets/gpt_dataset.py
index 42146d1acd2..418a02719df 100644
--- a/megatron/core/datasets/gpt_dataset.py
+++ b/megatron/core/datasets/gpt_dataset.py
@@ -58,8 +58,8 @@ class GPTDatasetConfig(BlendedMegatronDatasetConfig):
     Set to 0 if sequence parallel is not enabled regardless of TP size.
     """
 
-    hybrid_context_parallel: bool = False
-    """Option to enable hybrid context parallelism. When setting this to True, 
+    dynamic_context_parallel: bool = False
+    """Option to enable dynamic context parallelism. When setting this to True, 
     each sample should be divisible by the data parallel size * context parallel size * 2.
     If sequence parallel is enabled, it should be divisible by the 
     data parallel size * context parallel size * sequence parallel size * 2.
@@ -76,6 +76,9 @@ class GPTDatasetConfig(BlendedMegatronDatasetConfig):
     context_parallel_size: Optional[int] = None
     """The size of the context parallel group. Needed for padding in packed sequences."""
 
+    sft_mock_dataset_config_json: Optional[str] = None
+    """This config provides the necessary information for the mock dataset."""
+
     def __post_init__(self) -> None:
         """Do asserts and set fields post init"""
         super().__post_init__()
diff --git a/megatron/core/datasets/readme.md b/megatron/core/datasets/readme.md
index 452bf24e4a2..a61c623d960 100644
--- a/megatron/core/datasets/readme.md
+++ b/megatron/core/datasets/readme.md
@@ -192,6 +192,68 @@ To query the `BlendedDataset` for the _k_-th sample we do the following
 
 To save time during initialization, each index is built/cached sequentially on one process rank and subsequently loaded in parallel on other process ranks. The cached indices are unique to a hash generated in the `BlendedDataset.__init__` function.
 
+## Packing Scheduler
+
+The packing scheduler re-schedules variable-length sequences across DP×CP ranks to improve GPU utilization. It is built around two modules: `data_schedule.py` (high-level logic and entry points) and `data_schedule_utils.py` (utility functions).
+
+### Call Hierarchy
+
+The scheduling pipeline has two phases connected by the data iterator: `wrap_data_iterator` consumes the **original** data iterator, performs global-batch scheduling, and produces a **wrapped** (packed) data iterator; `get_batch_on_this_rank_for_sequence_packing` then consumes this **wrapped** data iterator to fetch individual packed microbatches during training.
+
+```
+                          original                              wrapped (packed)
+                       data_iterator                             data_iterator
+                            │                                        │
+                            ▼                                        ▼
+               ┌────────────────────────┐               ┌────────────────────────────────────┐
+               │  wrap_data_iterator()  │               │ get_batch_on_this_rank_for_        │
+Phase 1        │  (once per global      │   ────────►   │       sequence_packing()            │  Phase 2
+(scheduling)   │       batch)           │   returns     │  (once per microbatch,              │  (fetching)
+               │                        │   wrapped     │   called by training loop)          │
+               └───────────┬────────────┘   iterator    └──────────────┬─────────────────────┘
+                           │                                           │
+                           ▼                                           ▼
+          DpBalancedScheduler.run()                   next(wrapped_data_iterator)
+          │                                           ├─ get_thd_partitioned_indices()  [TE]
+          ├─ get_batch_and_global_seqlens()  [utils]  ├─ broadcast_tensor()             [utils]
+          ├─ get_groups_and_subsamples()              └─ PackedSeqParams(...)
+          ├─ reroute_samples_to_dcp_ranks()  [utils]
+          ├─ build_packed_microbatches()     [utils]
+          ├─ broadcast_to_pp_group()         [utils]
+          ├─ broadcast_scalars()             [utils]
+          └─ create_data_iterator()          [utils]
+```
+
+### `data_schedule.py`
+
+#### Entry Points
+
+- **`wrap_data_iterator(original_data_iterator) → wrapped_data_iterator`** — Top-level entry point called once per global batch. Takes the **original** data iterator as input, resolves the scheduler class from `scheduler_map`, instantiates it, and delegates to `scheduler.run()` which consumes all microbatches from the original iterator, re-schedules them, and produces a **wrapped** (packed) data iterator along with the updated `num_microbatches` and FLOPs statistics.
+
+- **`get_batch_on_this_rank_for_sequence_packing(wrapped_data_iterator)`** — Per-microbatch entry point called by the training loop. Takes the **wrapped** data iterator returned by `wrap_data_iterator` as input. Fetches one packed microbatch via `next(wrapped_data_iterator)`, broadcasts batch fields across TP ranks, optionally partitions sequences across CP ranks using Transformer Engine's `thd_get_partitioned_indices`, and constructs `PackedSeqParams` (with `cu_seqlens`, `max_seqlen`, `qkv_format=thd`).
+
+#### Scheduler Classes
+
+- **`BasePackingScheduler`** — Abstract base class. Defines the interface:
+  - `get_groups_and_subsamples()` — pure scheduling algorithm (must be overridden).
+  - `run()` — full pipeline: fetch → schedule → reroute → pack → broadcast → VPP handling.
+
+- **`DpBalancedScheduler(BasePackingScheduler)`** — Concrete scheduler that packs sequences in their original order until reaching `max_seqlen_per_dp_cp_rank × cp_size`. Aligns the number of microbatches to `dp_size` (and VPP stage multiples when applicable).
+
+### `data_schedule_utils.py`
+
+Utility functions consumed by the schedulers above:
+
+| Function | Role |
+|---|---|
+| `get_batch_and_global_seqlens()` | Fetch `num_microbatches` batches from the data iterator and all-gather sequence lengths across DP ranks. |
+| `reroute_samples_to_dcp_ranks()` | All-to-all communication to transfer sub-samples to their scheduled DP×CP rank. |
+| `build_packed_microbatches()` | Concatenate sub-samples within each microbatch group and produce `cu_seqlens`. |
+| `broadcast_to_pp_group()` | Broadcast packed samples and metadata from the first/last PP stage to middle stages. |
+| `broadcast_scalars()` | Broadcast scalar values (e.g. `num_microbatches`, FLOPs stats) across a process group. |
+| `broadcast_tensor()` | Broadcast a single tensor within a process group. |
+| `create_data_iterator()` | Wrap packed sample lists into a data iterator; handles VPP stage splitting. |
+
 ## Fast DataLoader initialization
 
 Especially for large-scale runs, DataLoader initialization can take several minutes, since it involves opening and memory-mapping multiple files and can significantly stress the filesystem. To speed up this process, we have developed the following three optimizations, controlled by configuration flags":
diff --git a/megatron/core/distributed/finalize_model_grads.py b/megatron/core/distributed/finalize_model_grads.py
index ca6bdd354ce..540dbbd51c5 100644
--- a/megatron/core/distributed/finalize_model_grads.py
+++ b/megatron/core/distributed/finalize_model_grads.py
@@ -281,7 +281,11 @@ def reset_model_temporary_tensors(config: TransformerConfig, model: List[torch.n
     """
     for model_chunk in model:
         for module in get_attr_wrapped_model(model_chunk, 'modules')():
-            if config.moe_router_enable_expert_bias and hasattr(module, 'expert_bias'):
+            if (
+                config.moe_router_enable_expert_bias
+                and hasattr(module, 'expert_bias')
+                and module.expert_bias is not None
+            ):
                 module.local_tokens_per_expert.zero_()
             if (
                 config.moe_router_load_balancing_type == "global_aux_loss"
@@ -303,7 +307,11 @@ def _update_router_expert_bias(model: List[torch.nn.Module], config: Transformer
             # cases where only the student is in training mode but the teacher is in eval mode
             # when using online knoweldge-distillation with Model-Optimizer. In this case, we want
             # to avoid updating teacher's expert_bias.
-            if hasattr(module, 'expert_bias') and module.training:
+            if (
+                hasattr(module, 'expert_bias')
+                and module.training
+                and module.expert_bias is not None
+            ):
                 tokens_per_expert_list.append(module.local_tokens_per_expert)
                 expert_bias_list.append(module.expert_bias)
     # For hybrid models with both MoE and Dense layers, this list can be empty.
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py
index 935508a57ab..878929715ae 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/mixed_precision.py
@@ -58,7 +58,6 @@
         QUANTIZED_MODEL_INIT_CLASS = fp8_model_init
 else:
     QUANTIZED_MODEL_INIT_CLASS = nullcontext
-
 # Detect the FP8 tensor class
 try:
     from transformer_engine.pytorch.tensor import QuantizedTensor
diff --git a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
index 960acc25ef6..031b56a5c26 100644
--- a/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
+++ b/megatron/core/distributed/fsdp/src/megatron_fsdp/param_and_grad_buffer.py
@@ -3100,7 +3100,6 @@ def _batch_quantize_blockwise_fp8_params(
                 if is_blockwise_float8tensor(param):
                     fp8_params.append(param)
                     if model_param.numel() == 0:
-                        # Empty parameter.
                         shard_fp32_from_fp8.append(None)
                         shard_offsets_in_fp8.append(None)
                         shard_model_params.append([None, None])
diff --git a/megatron/core/distributed/param_and_grad_buffer.py b/megatron/core/distributed/param_and_grad_buffer.py
index fe0a51b86b7..bcfa4c886e0 100644
--- a/megatron/core/distributed/param_and_grad_buffer.py
+++ b/megatron/core/distributed/param_and_grad_buffer.py
@@ -243,6 +243,9 @@ def __init__(
         # or bucket.grad_data.
         self.cached_param_buffer_shard_list = [None] * len(self.buckets)
         self.cached_grad_buffer_shard_list = [None] * len(self.buckets)
+        # Track grad mode used to create cached param views. Rebuild if mode changes to avoid
+        # mixing no_grad-created views with in-place updates in grad-enabled mode.
+        self._cached_param_buffer_shards_grad_enabled = None
 
     def reset(self):
         """
@@ -396,6 +399,7 @@ def start_param_sync(self, force_sync: bool = False):
                             model_p.data.copy_(updated_p)
                     bucket.layerwise_gather_list = None
                 self.param_gather_handle = None
+
         else:
             # Standard distributed optimizer path: use _coalescing_manager.
             # all_gather_into_tensor writes directly into a contiguous output buffer and
diff --git a/megatron/core/extensions/transformer_engine.py b/megatron/core/extensions/transformer_engine.py
index 419c85b404d..2a82a1e1cf2 100644
--- a/megatron/core/extensions/transformer_engine.py
+++ b/megatron/core/extensions/transformer_engine.py
@@ -1559,21 +1559,22 @@ def forward(
         """Forward."""
         if packed_seq_params is not None:
             # If Dynamic CP group is provided, update TE DPA CP group
-            if packed_seq_params.cp_group is not None:
-                self.cp_group = packed_seq_params.cp_group
-                super().set_context_parallel_group(
-                    self.cp_group,
-                    torch.distributed.get_process_group_ranks(self.cp_group),
-                    TEDotProductAttention.cp_stream,
-                    self.cp_comm_type,
-                )
-            # If cp_group is None but local_cp_size is provided,
-            # Indicates to turn off CP dynamically
-            elif packed_seq_params.local_cp_size is not None:
-                assert (
-                    packed_seq_params.local_cp_size == 1
-                ), "local_cp_size must be == 1 if provided without cp_group"
-                super().set_context_parallel_group(None, None, None, self.cp_comm_type)
+            if packed_seq_params.local_cp_size is not None:
+                if packed_seq_params.local_cp_size == 1:
+                    super().set_context_parallel_group(None, None, None, self.cp_comm_type)
+                else:
+                    assert (
+                        packed_seq_params.cp_group is not None
+                    ), "cp_group is not set in packed_seq_params for dynamic CP"
+                    self.cp_group = packed_seq_params.cp_group
+                    if TEDotProductAttention.cp_stream is None:
+                        TEDotProductAttention.cp_stream = torch.cuda.Stream()
+                    super().set_context_parallel_group(
+                        self.cp_group,
+                        torch.distributed.get_process_group_ranks(self.cp_group),
+                        TEDotProductAttention.cp_stream,
+                        self.cp_comm_type,
+                    )
             self.kept_packed_seq_params.discard("cp_group")
             self.kept_packed_seq_params.discard("local_cp_size")
 
@@ -1755,6 +1756,14 @@ def __init__(
                 tp_size = 1
                 tp_group_for_te = None
 
+            if is_te_min_version("2.14.0"):
+                extra_kwargs["single_grouped_weight"] = getattr(
+                    config, "moe_single_grouped_weight", False
+                )
+                extra_kwargs["single_grouped_bias"] = getattr(
+                    config, "moe_single_grouped_bias", False
+                )
+
             super().__init__(
                 num_gemms=num_gemms,
                 in_features=input_size,
@@ -1776,6 +1785,50 @@ def __init__(
             for param in self.parameters():
                 setattr(param, "allreduce", not (is_expert and self.expert_parallel))
 
+            def normalize_grouped_parameter_keys(
+                self,
+                state_dict,
+                prefix,
+                local_metadata,
+                strict,
+                missing_keys,
+                unexpected_keys,
+                error_msgs,
+            ):
+                """Make grouped checkpoint keys compatible across parameter layouts."""
+
+                def maybe_remap_param(param_name: str, single_grouped: bool) -> None:
+                    grouped_key = f"{prefix}{param_name}"
+                    indexed_keys = [
+                        f"{prefix}{param_name}{gemm_idx}" for gemm_idx in range(self.num_gemms)
+                    ]
+                    has_grouped_key = grouped_key in state_dict
+                    has_any_indexed_key = any(key in state_dict for key in indexed_keys)
+                    has_all_indexed_keys = all(key in state_dict for key in indexed_keys)
+
+                    if single_grouped:
+                        if has_grouped_key or not has_all_indexed_keys:
+                            return
+                        state_dict[grouped_key] = torch.stack(
+                            [state_dict.pop(key) for key in indexed_keys], dim=0
+                        )
+                    else:
+                        if has_any_indexed_key or not has_grouped_key:
+                            return
+                        split_tensors = self._split_grouped_checkpoint_tensor(
+                            state_dict.pop(grouped_key), grouped_key
+                        )
+                        for gemm_idx, tensor in enumerate(split_tensors):
+                            state_dict[f"{prefix}{param_name}{gemm_idx}"] = tensor
+
+                maybe_remap_param("weight", getattr(self, "single_grouped_weight", False))
+                if self.use_bias:
+                    maybe_remap_param("bias", getattr(self, "single_grouped_bias", False))
+
+            self._register_load_state_dict_pre_hook(
+                normalize_grouped_parameter_keys, with_module=True
+            )
+
             # Explicitly stamp partition_dim and partition_stride on expert weight
             # tensors when explicit_expert_comm cleared parallel_mode.  TE ≤2.12
             # set these internally; TE ≥2.13 no longer does (parallel_mode=None
@@ -1881,6 +1934,31 @@ def merge_extra_states(
 
             self._register_load_state_dict_pre_hook(merge_extra_states, with_module=True)
 
+        def _split_grouped_checkpoint_tensor(
+            self, tensor: torch.Tensor, checkpoint_key: str
+        ) -> list[torch.Tensor]:
+            """Split grouped checkpoint tensor into one tensor per GEMM."""
+            if hasattr(tensor, "split_into_quantized_tensors") and callable(
+                tensor.split_into_quantized_tensors
+            ):
+                grouped_tensors = getattr(tensor, "quantized_tensors", None)
+                if grouped_tensors is None:
+                    grouped_tensors = tensor.split_into_quantized_tensors()
+                if len(grouped_tensors) != self.num_gemms:
+                    raise RuntimeError(
+                        f"Grouped checkpoint tensor {checkpoint_key} has {len(grouped_tensors)} "
+                        f"groups, expected {self.num_gemms}."
+                    )
+                return list(grouped_tensors)
+            if tensor.ndim > 0 and tensor.shape[0] == self.num_gemms:
+                return list(tensor.unbind(dim=0))
+            if tensor.ndim > 0 and tensor.shape[0] % self.num_gemms == 0:
+                return list(torch.chunk(tensor, self.num_gemms, dim=0))
+            raise RuntimeError(
+                f"Cannot split checkpoint tensor {checkpoint_key} with shape {tuple(tensor.shape)} "
+                f"into {self.num_gemms} GEMM shards."
+            )
+
         def finish_init(self, quantization_config: QuantizationConfig):
             """Post-init of quantization override"""
             if quantization_config is None:
@@ -1985,6 +2063,21 @@ def _sharded_state_dict_grouped(
             singleton_local_shards = (metadata or {}).get('singleton_local_shards', False)
             sharded_state_dict = {}
             full_state_dict = self.state_dict(prefix="", keep_vars=True)
+            grouped_split_cache = {}
+
+            def get_gemm_tensor(param_name: str, gemm_idx: int) -> torch.Tensor:
+                indexed_name = f"{param_name}{gemm_idx}"
+                if indexed_name in full_state_dict:
+                    return full_state_dict[indexed_name]
+                if param_name not in full_state_dict:
+                    raise KeyError(indexed_name)
+                if param_name not in grouped_split_cache:
+                    grouped_split_cache[param_name] = self._split_grouped_checkpoint_tensor(
+                        full_state_dict[param_name], param_name
+                    )
+                grouped_splits = grouped_split_cache[param_name]
+                return grouped_splits[gemm_idx]
+
             num_global_experts = get_pg_size(self._pg_collection.ep) * self.num_gemms
             local_expert_indices_offset = get_pg_rank(self._pg_collection.ep) * self.num_gemms
             ep_axis = len(sharded_offsets)
@@ -1992,11 +2085,11 @@ def _sharded_state_dict_grouped(
             for gemm_idx in range(self.num_gemms):
                 global_expert_idx = local_expert_indices_offset + gemm_idx
                 state_dict = {
-                    f"{gemm_idx}.weight": full_state_dict[f"weight{gemm_idx}"],
+                    f"{gemm_idx}.weight": get_gemm_tensor("weight", gemm_idx),
                     f"{gemm_idx}._extra_state": extra_states[gemm_idx],
                 }
                 if self.use_bias:
-                    state_dict[f"{gemm_idx}.bias"] = full_state_dict[f"bias{gemm_idx}"]
+                    state_dict[f"{gemm_idx}.bias"] = get_gemm_tensor("bias", gemm_idx)
                 if singleton_local_shards:
                     expert_prefix = f"{global_expert_idx}.{prefix}"
                     new_sharded_offsets = sharded_offsets
@@ -2432,8 +2525,193 @@ def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Option
 
             return out, bias
 
+    class TEFusedDenseMLP(TEFusedMLP):
+        """Dense MLP using GroupedLinear(num_groups=1) to trigger
+        ForwardGroupedMLP_CuTeGEMMSwiGLU_MXFP8 fusion on SM100+ with MXFP8 recipe.
+
+        Subclass of TEFusedMLP -> does not modify TEFusedMLP or TEGroupedMLP.
+        The fused kernel fires automatically via the TE op fuser when it detects
+        the GroupedLinear -> ScaledSwiGLU -> GroupedLinear pattern with MXFP8 recipe.
+        """
+
+        def __init__(self, *args, **kwargs):
+            super().__init__(*args, **kwargs)
+            self._norm_seq: Optional[Tuple[te.pytorch.ops.Sequential]] = None
+            if not is_te_min_version("2.14.0"):
+                raise RuntimeError(
+                    f"{self.__class__.__name__} requires Transformer Engine >= 2.14.0 "
+                    "(needs pytorch.ops.GroupedLinear and pytorch.ops.ScaledSwiGLU)"
+                )
+            if self.config.add_bias_linear:
+                raise ValueError(
+                    f"{self.__class__.__name__} does not support add_bias_linear=True; "
+                    "the CuTeGEMM fused kernel requires bias-free linear layers."
+                )
+            if self.config.activation_func != F.silu or not self.config.gated_linear_unit:
+                raise ValueError(
+                    f"{self.__class__.__name__} requires SwiGLU activation "
+                    "(activation_func=F.silu, gated_linear_unit=True) "
+                    "for the CuTeGEMM fused kernel, but got "
+                    f"activation_func={self.config.activation_func}, "
+                    f"gated_linear_unit={self.config.gated_linear_unit}."
+                )
+
+        def _make_fused_impl(self) -> te.pytorch.ops.Sequential:
+            """Construct fused module with GroupedLinear(num_groups=1) + ScaledSwiGLU."""
+
+            fused_impl = te.pytorch.ops.Sequential()
+
+            # Tensor parallelism configuration
+            tp_world_size = get_tensor_model_parallel_world_size()
+            tp_group = None
+            if tp_world_size > 1:
+                tp_group = get_tensor_model_parallel_group()
+
+            # RNG state
+            rng_state_tracker_function = None
+            if get_cuda_rng_tracker().is_initialized():
+                rng_state_tracker_function = get_cuda_rng_tracker
+
+            # Check submodule types (same as TEFusedMLP)
+            if not isinstance(self.linear_fc1, te.pytorch.LayerNormLinear):
+                raise ValueError(
+                    f"{self.__class__.__name__} expects FC1 to be "
+                    "Transformer Engine LayerNormLinear, but found "
+                    f"{self.linear_fc1.__class__.__name__}."
+                )
+            if not isinstance(self.linear_fc2, te.pytorch.Linear):
+                raise ValueError(
+                    f"{self.__class__.__name__} expects FC2 to be "
+                    "Transformer Engine Linear, but found "
+                    f"{self.linear_fc2.__class__.__name__}."
+                )
+
+            # Norm op (same as TEFusedMLP)
+            norm_type = self.linear_fc1.normalization
+            norm_shape = self.linear_fc1.weight.size(1)
+            kwargs = {
+                "eps": self.linear_fc1.eps,
+                "device": "meta",
+                "dtype": self.linear_fc1.layer_norm_weight.dtype,
+                "zero_centered_gamma": self.linear_fc1.zero_centered_gamma,
+            }
+            op = None
+            if norm_type == "LayerNorm":
+                op = te.pytorch.ops.LayerNorm(norm_shape, **kwargs)
+                op.weight = self.linear_fc1.layer_norm_weight
+                op.bias = self.linear_fc1.layer_norm_bias
+            elif norm_type == "RMSNorm":
+                op = te.pytorch.ops.RMSNorm(norm_shape, **kwargs)
+                op.weight = self.linear_fc1.layer_norm_weight
+            else:
+                raise ValueError(f"Unsupported normalization ({norm_type})")
+            # Store norm in a separate Sequential applied OUTSIDE the MXFP8 autocast
+            # in forward(). Running norm inside MXFP8 context corrupts the saved rstd
+            # used in RMSNorm backward, causing gradient amplification up to 10^6.
+            # Wrapped in tuple to avoid nn.Module submodule registration (which would
+            # duplicate the shared norm weight in state_dict/parameters).
+            norm_seq = te.pytorch.ops.Sequential()
+            norm_seq.append(op)
+            self._norm_seq = (norm_seq,)
+
+            # GLU interleave size must match ScaledSwiGLU and the CuTe kernel.
+            _GLU_INTERLEAVE_SIZE = 32
+
+            # FC1: GroupedLinear(num_groups=1) instead of BasicLinear
+            weight = self.linear_fc1.weight
+            op = te.pytorch.ops.GroupedLinear(
+                num_groups=1,
+                in_features=weight.size(1),
+                out_features=weight.size(0) * tp_world_size,
+                device="meta",
+                dtype=weight.dtype,
+                bias=False,
+                rng_state_tracker_function=rng_state_tracker_function,
+                accumulate_into_main_grad=self.linear_fc1.fuse_wgrad_accumulation,
+            )
+            op.weight0 = weight
+            op._glu_interleave_size = _GLU_INTERLEAVE_SIZE  # signals fuser_forward to interleave
+            fused_impl.append(op)
+
+            # ScaledSwiGLU with glu_interleave_size=32
+            # Required by ForwardGroupedMLP_CuTeGEMMSwiGLU_MXFP8
+            fused_impl.append(te.pytorch.ops.ScaledSwiGLU(glu_interleave_size=32))
+
+            # FC2: GroupedLinear(num_groups=1) instead of BasicLinear
+            weight = self.linear_fc2.weight
+            op = te.pytorch.ops.GroupedLinear(
+                num_groups=1,
+                in_features=weight.size(1),
+                out_features=weight.size(0),
+                device="meta",
+                dtype=weight.dtype,
+                bias=False,
+                rng_state_tracker_function=rng_state_tracker_function,
+                accumulate_into_main_grad=self.linear_fc2.fuse_wgrad_accumulation,
+            )
+            op.weight0 = weight
+            # FC2 has no SwiGLU — MXFP8 quantization done on-the-fly in fuser_forward.
+            # No _mxfp8_weight0 pre-computation to avoid ~28 GB persistent FP8 tensors.
+            fused_impl.append(op)
+
+            if tp_world_size > 1:
+                if self.linear_fc2.sequence_parallel:
+                    fused_impl.append(te.pytorch.ops.ReduceScatter(tp_group))
+                else:
+                    fused_impl.append(te.pytorch.ops.AllReduce(tp_group))
+
+            self._register_hooks_on_fused_impl(fused_impl)
+            return fused_impl
+
+        def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[Tensor, Optional[Tensor]]:
+            """Forward pass using GroupedLinear(num_groups=1) + ScaledSwiGLU."""
+
+            orig_shape = hidden_states.shape
+            hidden_size = hidden_states.size(-1)
+            hidden_states_2d = hidden_states.view(-1, hidden_size)
+            total_tokens = hidden_states_2d.size(0)
+
+            tokens_per_expert = torch.full(
+                (1,), total_tokens, dtype=torch.long, device=hidden_states.device
+            )
+            scales = torch.ones(
+                total_tokens, device=hidden_states.device, dtype=hidden_states.dtype
+            )
+
+            # Build fused impl and cache recipe lazily on first forward pass.
+            # Both are created once and reused — avoids object creation every call.
+            if not hasattr(self, '_recipe'):
+                if os.getenv("FP4_RECIPE", "") == "nvfp4":
+                    self._recipe = te.common.recipe.NVFP4BlockScaling()
+                else:
+                    self._recipe = te.common.recipe.MXFP8BlockScaling()
+            recipe = self._recipe
+
+            if self._fused_impl is None:
+                with te.pytorch.fp8_autocast(enabled=True, fp8_recipe=recipe):
+                    self._fused_impl = (self._make_fused_impl(),)
+
+            # Apply norm in BF16 OUTSIDE the MXFP8 autocast to preserve the rstd
+            # tensor used by RMSNorm backward (running it inside causes up to 10^6
+            # gradient amplification, and causes convergence issues).
+            normed = self._norm_seq[0](hidden_states_2d)
+
+            with te.pytorch.fp8_autocast(enabled=True, fp8_recipe=recipe):
+                out = self._fused_impl[0](normed, tokens_per_expert, scales, tokens_per_expert)
+
+            out = out.view(*orig_shape[:-1], out.size(-1))
+
+            bias = None
+            if self.linear_fc2.te_return_bias:
+                bias = self.linear_fc2.bias
+                if isinstance(bias, torch.Tensor) and bias.numel() == 0:
+                    bias = None
+
+            return out, bias
+
 else:
     TEFusedMLP = None  # type: ignore[assignment, misc]
+    TEFusedDenseMLP = None  # type: ignore[assignment, misc]
 
 
 class TEDelayedScaling(te.common.recipe.DelayedScaling):
@@ -2820,3 +3098,24 @@ def set_save_original_input(module):
     from transformer_engine.pytorch.float8_tensor import Float8Tensor
 except ImportError:
     Float8Tensor = None
+
+
+def get_thd_partitioned_indices(cu_seqlens, total_tokens, cp_size, cp_rank):
+    """Get partitioned indices for THD format data in context parallel.
+
+    Args:
+        cu_seqlens: Cumulative sequence lengths tensor.
+        total_tokens: Total number of tokens.
+        cp_size: Context parallel world size.
+        cp_rank: Context parallel rank.
+
+    Returns:
+        Partitioned indices tensor.
+    """
+    assert is_te_min_version("1.10.0"), (
+        "Please update Transformer Engine to >= 1.10 to use "
+        "Context Parallel with THD format data"
+    )
+    import transformer_engine_torch as tex
+
+    return tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, cp_size, cp_rank)
diff --git a/megatron/core/extensions/transformer_engine_spec_provider.py b/megatron/core/extensions/transformer_engine_spec_provider.py
index 04228e02e88..c365fb4835d 100644
--- a/megatron/core/extensions/transformer_engine_spec_provider.py
+++ b/megatron/core/extensions/transformer_engine_spec_provider.py
@@ -19,6 +19,7 @@
 from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
 from megatron.core.models.backends import BackendSpecProvider
 from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
+from megatron.core.transformer.dot_product_attention import DotProductAttention
 from megatron.core.transformer.mlp import MLPSubmodules, TEActivationFunctionBuilder
 from megatron.core.transformer.moe.experts import GroupedMLPSubmodules, SequentialMLP, TEGroupedMLP
 from megatron.core.transformer.moe.moe_layer import ExpertsBuilder
@@ -36,6 +37,10 @@ def __new__(cls, *args, **kwargs):
 class TESpecProvider(BackendSpecProvider):
     """A protocol for providing the submodules used in Spec building."""
 
+    def __init__(self, fallback_to_eager_attn: bool = False):
+        super().__init__()
+        self.fallback_to_eager_attn = fallback_to_eager_attn
+
     def linear(self) -> type:
         """Which linear module TE backend uses"""
         return TELinear
@@ -70,6 +75,8 @@ def layer_norm(
 
     def core_attention(self) -> type:
         """Which module to use for attention"""
+        if self.fallback_to_eager_attn:
+            return DotProductAttention
         return TEDotProductAttention
 
     def grouped_mlp_modules(self, moe_use_grouped_gemm: bool) -> ExpertsBuilder:
diff --git a/megatron/core/full_cuda_graph.py b/megatron/core/full_cuda_graph.py
index 7d790a07c6c..049bd7ce17c 100644
--- a/megatron/core/full_cuda_graph.py
+++ b/megatron/core/full_cuda_graph.py
@@ -2,6 +2,7 @@
 
 """Full iteration CUDA graph for training."""
 
+import gc
 import logging
 
 import torch
@@ -182,12 +183,10 @@ def __call__(self, *args, **kwargs):
             torch.cuda.synchronize()
             torch.distributed.barrier()
             logger.info(f'CUDA graph capture done for {training_str}!!!')
-
         if FullCudaGraphWrapper.cuda_graph[training_str] is None:
             FullCudaGraphWrapper.result[training_str] = self.forward_backward_func(*args, **kwargs)
         else:
             FullCudaGraphWrapper.cuda_graph[training_str].replay()
-
         self.next_iter(training_str)
         return FullCudaGraphWrapper.result[training_str]
 
@@ -198,3 +197,19 @@ def curr_iter(self, stage):
     def next_iter(self, stage):
         """Increment current training/validation iteration."""
         FullCudaGraphWrapper.curr_iteration[stage] += 1
+
+    def reset_cuda_graph(self, stage=None):
+        """Reset CUDA graph."""
+        if stage is None or stage == 'training':
+            if FullCudaGraphWrapper.cuda_graph['training'] is not None:
+                del FullCudaGraphWrapper.cuda_graph['training']
+                FullCudaGraphWrapper.cuda_graph['training'] = None
+            FullCudaGraphWrapper.result['training'] = None
+            FullCudaGraphWrapper.curr_iteration['training'] = 0
+        if stage is None or stage == 'validation':
+            if FullCudaGraphWrapper.cuda_graph['validation'] is not None:
+                del FullCudaGraphWrapper.cuda_graph['validation']
+                FullCudaGraphWrapper.cuda_graph['validation'] = None
+            FullCudaGraphWrapper.result['validation'] = None
+            FullCudaGraphWrapper.curr_iteration['validation'] = 0
+        gc.collect()
diff --git a/megatron/core/fusions/fused_bias_dropout.py b/megatron/core/fusions/fused_bias_dropout.py
index 2eb4007f75c..1f2448d86be 100644
--- a/megatron/core/fusions/fused_bias_dropout.py
+++ b/megatron/core/fusions/fused_bias_dropout.py
@@ -1,10 +1,13 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
-from typing import Optional, Tuple
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+from typing import TYPE_CHECKING, Optional, Tuple
 
 import torch
 
 from megatron.core.jit import jit_fuser
 
+if TYPE_CHECKING:
+    from megatron.core.tensor_parallel.random import CheckpointManager
+
 # pylint: disable=missing-function-docstring
 
 
@@ -80,7 +83,26 @@ def bias_dropout_add_fused_inference(
     return _bias_dropout_add_func(x_with_bias, residual, prob, False)
 
 
-def get_bias_dropout_add(training, fused):
+def get_bias_dropout_add(
+    training, fused, mhc_recompute_manager: Optional['CheckpointManager'] = None
+):
+    """
+    Get the bias-dropout-add function.
+
+    Args:
+        training: Whether in training mode.
+        fused: Whether to use fused implementation.
+        mhc_recompute_manager: Optional CheckpointManager for checkpoint management.
+            When provided, the returned function will wrap the BDA operation with
+            CheckpointWithoutOutput for memory-efficient recomputation.
+
+    Returns:
+        A callable that performs bias-dropout-add operation.
+    """
+    if mhc_recompute_manager is not None:
+        # Return a checkpointed version that handles tuple unpacking internally
+        return _get_checkpointed_bda(training, fused, mhc_recompute_manager)
+
     if fused:
         # jit scripting for a nn.module (with dropout) is not
         # triggering the fusion kernel. For now, we use two
@@ -92,3 +114,68 @@ def get_bias_dropout_add(training, fused):
             return bias_dropout_add_fused_inference
     else:
         return bias_dropout_add_unfused(training)
+
+
+def _get_checkpointed_bda(training, fused, mhc_recompute_manager: 'CheckpointManager'):
+    """
+    Create a checkpointed bias-dropout-add function.
+
+    This function handles:
+    1. Tuple unpacking for x_with_bias (required because save_for_backward can't save tuples)
+    2. Non-tensor arguments like dropout probability (handled by CheckpointWithoutOutput)
+    3. Auto-registration to the CheckpointManager
+
+    Args:
+        training: Whether in training mode.
+        fused: Whether to use fused implementation.
+        mhc_recompute_manager: CheckpointManager for checkpoint management.
+
+    Returns:
+        A callable that performs checkpointed bias-dropout-add operation.
+    """
+    from megatron.core.tensor_parallel.random import CheckpointWithoutOutput
+
+    # Get the underlying BDA function
+    if fused:
+        if training:
+            bda_func = bias_dropout_add_fused_train
+        else:
+            bda_func = bias_dropout_add_fused_inference
+    else:
+        bda_func = bias_dropout_add_unfused(training)
+
+    def _checkpointed_bda(x_with_bias, residual, prob):
+        """
+        Checkpointed BDA that handles tuple unpacking internally.
+
+        Args:
+            x_with_bias: Either a tuple (x, bias) or a single tensor x.
+            residual: Residual tensor.
+            prob: Dropout probability.
+
+        Returns:
+            Output tensor after bias-dropout-add.
+        """
+        # Create checkpoint with manager
+        ckpt = CheckpointWithoutOutput(ckpt_manager=mhc_recompute_manager)
+
+        # Handle case where x_with_bias might be a single tensor (e.g., from IdentityOp)
+        if isinstance(x_with_bias, tuple):
+            x, bias = x_with_bias
+        else:
+            x = x_with_bias
+            bias = None
+
+        # Wrapper function that re-packs the tuple for the actual BDA function
+        def _bda_wrapper(output, bias, res, dropout):
+            return bda_func((output, bias), res, dropout)
+
+        # Call checkpoint with unpacked arguments
+        result = ckpt.checkpoint(_bda_wrapper, x, bias, residual, prob)
+
+        # No-op when manager is set - manager handles all discarding uniformly
+        ckpt.discard_output_and_register_recompute(result)
+
+        return result
+
+    return _checkpointed_bda
diff --git a/megatron/core/fusions/fused_bias_swiglu.py b/megatron/core/fusions/fused_bias_swiglu.py
index 632470876c9..ec195551ffa 100644
--- a/megatron/core/fusions/fused_bias_swiglu.py
+++ b/megatron/core/fusions/fused_bias_swiglu.py
@@ -48,6 +48,23 @@ def weighted_swiglu(y, weights):
     return res.to(dtype)
 
 
+@jit_fuser
+def clamped_swiglu(y, clamp_value):
+    dtype = y.dtype
+    y_1, y_2 = torch.chunk(y.to(torch.float32), 2, -1)
+    y_1 = y_1.clamp(min=None, max=clamp_value)
+    y_2 = y_2.clamp(min=-clamp_value, max=clamp_value)
+    res = F.silu(y_1) * y_2
+    return res.to(dtype)
+
+
+@jit_fuser
+def clamped_weighted_swiglu(y, weights, clamp_value):
+    dtype = y.dtype
+    res = clamped_swiglu(y, clamp_value) * weights
+    return res.to(dtype)
+
+
 # gradient of tanh approximation of gelu
 # gradient of actual gelu is:
 # 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x)
@@ -97,6 +114,36 @@ def weighted_swiglu_back(g, y, weights):
     return input_grad.to(input_dtype), weights_grad.to(w_dtype)
 
 
+@jit_fuser
+def clamped_swiglu_back(g, y, clamp_value):
+    dtype = y.dtype
+    y_1, y_2 = torch.chunk(y.to(torch.float32), 2, -1)
+    y_1c = y_1.clamp(min=None, max=clamp_value)
+    y_2c = y_2.clamp(min=-clamp_value, max=clamp_value)
+    res = torch.cat(
+        (
+            g
+            * torch.sigmoid(y_1c)
+            * (1 + y_1c * (1 - torch.sigmoid(y_1c)))
+            * y_2c
+            * (y_1 <= clamp_value).to(g.dtype),
+            g * F.silu(y_1c) * ((y_2 >= -clamp_value) & (y_2 <= clamp_value)).to(g.dtype),
+        ),
+        -1,
+    )
+    return res.to(dtype)
+
+
+@jit_fuser
+def clamped_weighted_swiglu_back(g, y, weights, clamp_value):
+    input_dtype = y.dtype
+    w_dtype = weights.dtype
+    input_grad = clamped_swiglu_back(g * weights, y, clamp_value)
+    weights_grad = clamped_swiglu(y, clamp_value) * g.to(w_dtype)
+    weights_grad = torch.sum(weights_grad, dim=-1, keepdim=True)
+    return input_grad.to(input_dtype), weights_grad.to(w_dtype)
+
+
 class BiasSwiGLUFunction(torch.autograd.Function):
     """Custom autograd function for SwiGLU activation with bias support."""
 
@@ -190,20 +237,27 @@ def backward(ctx, grad_output):
 
 class WeightedSwiGLUFunction(torch.autograd.Function):
     @staticmethod
-    # bias is an optional argument
-    def forward(ctx, input, weights, fp8_input_store):
+    def forward(ctx, input, weights, fp8_input_store, clamp_value):
         input_for_backward = input.to(torch.float8_e4m3fn) if fp8_input_store else input
         ctx.save_for_backward(input_for_backward, weights)
         ctx.ori_input_dtype = input.dtype
         ctx.fp8_input_store = fp8_input_store
-        return weighted_swiglu(input, weights)
+        ctx.clamp_value = clamp_value
+        if clamp_value is not None and clamp_value > 0:
+            res = clamped_weighted_swiglu(input, weights, clamp_value)
+        else:
+            res = weighted_swiglu(input, weights)
+        return res
 
     @staticmethod
     def backward(ctx, grad_output):
         input, weights = ctx.saved_tensors
         input = input.to(ctx.ori_input_dtype) if ctx.fp8_input_store else input
-        tmp, wgrad = weighted_swiglu_back(grad_output, input, weights)
-        return tmp, wgrad, None
+        if ctx.clamp_value is not None and ctx.clamp_value > 0:
+            tmp, wgrad = clamped_weighted_swiglu_back(grad_output, input, weights, ctx.clamp_value)
+        else:
+            tmp, wgrad = weighted_swiglu_back(grad_output, input, weights)
+        return tmp, wgrad, None, None
 
 
 def bias_swiglu_impl(input, bias, fp8_input_store=False, cpu_offload_input=False):
@@ -236,7 +290,7 @@ def bias_swiglu_impl(input, bias, fp8_input_store=False, cpu_offload_input=False
     return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1)
 
 
-def weighted_bias_swiglu_impl(input, bias, weights, fp8_input_store=False):
+def weighted_bias_swiglu_impl(input, bias, weights, fp8_input_store=False, clamp_value=None):
     """
     Token-wise-weighted bias swiglu fusion.
     """
@@ -246,7 +300,7 @@ def weighted_bias_swiglu_impl(input, bias, weights, fp8_input_store=False):
     if bias is not None:
         raise NotImplementedError("Bias is not supported for weighted swiglu fusion")
     else:
-        output = WeightedSwiGLUFunction.apply(input, weights, fp8_input_store)
+        output = WeightedSwiGLUFunction.apply(input, weights, fp8_input_store, clamp_value)
 
     return output if len(ori_shape) == 2 else output.view(ori_shape[0], ori_shape[1], -1)
 
diff --git a/megatron/core/fusions/fused_linear_cross_entropy.py b/megatron/core/fusions/fused_linear_cross_entropy.py
new file mode 100644
index 00000000000..b533fef7aa3
--- /dev/null
+++ b/megatron/core/fusions/fused_linear_cross_entropy.py
@@ -0,0 +1,242 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+"""
+Linear Cross Entropy API
+Fuse cross entropy with linear layer.
+"""
+
+import typing
+from functools import lru_cache
+
+import torch
+
+
+class Platform:
+    """
+    Singleton class for targeted GPU platform.
+    """
+
+    _instance: typing.Optional["Platform"] = None
+
+    def __new__(cls) -> "Platform":
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self) -> None:
+        if getattr(self, "_initialized", False):
+            return
+
+        assert torch.cuda.is_available(), "CUDA is not available"
+        device = torch.cuda.current_device()
+        cc = torch.cuda.get_device_capability(device)
+
+        if cc[0] == 10:
+            from .linear_cross_entropy.blackwell import entry as gpu_entry
+
+            self.forward_func: typing.Callable[..., typing.Any] = gpu_entry.forward
+            self.backward_func: typing.Callable[..., typing.Any] = gpu_entry.backward
+        else:
+            raise ValueError(f"Unsupported architecture: {cc[0]}")
+
+        self._initialized = True
+
+
+@lru_cache(maxsize=1)
+def _get_platform() -> Platform:
+    """
+    Helper function to lazy initialize the platform.
+    """
+    return Platform()
+
+
+class LinearCrossEntropy(torch.autograd.Function):
+    """
+    This class implements a custom autograd function for linear and cross entropy,
+    whose equivalent logic in PyTorch is:
+        ```python
+        def torch_entropy(hidden, weight, labels):
+            logits = torch.matmul(hidden, weight)
+            logprobs = torch.nn.functional.cross_entropy(logits, labels)
+            return logprobs
+        ```
+    """
+
+    @staticmethod
+    def forward(
+        ctx,
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
+        reduction: typing.Literal["none", "sum", "mean"] = "mean",
+        ignore_index: int = -100,
+        sequence_parallel: bool = False,
+    ) -> torch.Tensor:
+        """
+        The forward pass of the Linear Cross Entropy.
+        If tp_group is not None, the weight tensor to each TP rank should be
+        (global_vocab_size // world_size, dim).
+        Note that each of the ranks should get equal shards along the vocab_size dimension.
+
+        Args:
+            @param hidden: the input tensor with shape (num_tokens, dim)
+            @param weight: the lm_head weight tensor with shape (local_vocab_size, dim)
+            @param labels: the labels tensor with shape (num_tokens,)
+            @param tp_group: the distributed process group for TP.
+            @param reduction: Default to "mean", and can be one of "none", "sum", "mean".
+            @param ignore_index: The index to ignore. Default to -100.
+            @param sequence_parallel: Whether to use sequence parallel. Default to False.
+        Returns:
+            @return: logprobs with shape
+                - either (num_tokens,) when reduction is "none"
+                - or (1,) when reduction is "mean" or "sum"
+
+        tp_group is None ----------------------------------> DP
+                B
+            A   C
+        tp_group is not None & sequence_parallel is False -> TP
+                B0  B1
+            A   C0  C1
+        tp_group is not None & sequence_parallel is True --> SP
+                B0  B1
+            A0  C0  XX
+            A1  XX  C1
+
+        When tp_group is not None, the weight tensor will be split along the vocab_size
+        dimension, which means each rank will get equal shards along the global_vocab_size
+        dimension. Specifically, the weight tensor to each rank will be (local_vocab_size, dim).
+        And there is an assumption that each rank will get the same local_vocab_size.
+
+        When sequence_parallel is True, the hidden tensor will be split along the
+        sequence length dimension, which means each rank will get equal shards along
+        the sequence length dimension. Specifically, the hidden tensor to each rank
+        will be (local_num_tokens, dim). And there is an assumption that each rank
+        will get the same local_num_tokens.
+
+        In TP forward pass, the hidden tensor and label tensor shall be identical
+        among all TP ranks, and it's user's responsibility to ensure the hidden tensor
+        is identical among all TP ranks. Then this operation will produce identical
+        logprobs among all TP ranks.
+
+        In TP backward pass, the gradient of the logprobs shall be identical among all
+        TP ranks, and it's user's responsibility to ensure the gradient of the logprobs
+        is identical among all TP ranks. Then this operation will produce distinct gradients
+        for the local weight tensor, and identical gradients for the hidden tensor.
+
+        ```python
+        # ------------ forward pass ------------ #
+        hidden = tp_group.broadcast(hidden, src=0) # handled by framework
+        labels = tp_group.broadcast(labels, src=0) # handled by framework
+        logprobs = linear_cross_entropy(...)
+        # each rank will get the same logprobs
+
+        # ------------ backward pass ------------ #
+        g_logprobs = tp_group.broadcast(g_logprobs, src=0) # handled by framework
+        d_hidden, d_weight = torch.autograd.grad(...)
+        # each rank will get the same d_hidden,
+        # and distinct d_weight for local weight shard
+        ```
+
+        In SP forward pass, the hidden tensor shall be split along the sequence length dimension,
+        and the label tensor shall be identical among all TP ranks.
+        Then this operation will produce identical logprobs among all TP ranks.
+
+        In SP backward pass, the gradient of the logprobs shall be identical among all TP ranks,
+        Then this operation will produce distinct gradients for the local hidden tensor
+        and local weight tensor.
+        ```python
+        # ------------ forward pass ------------ #
+        hidden = global_hidden[tp_rank] # handled by framework
+        labels = tp_group.broadcast(labels, src=0) # handled by framework
+        logprobs = linear_cross_entropy(...)
+        # each rank will get the same logprobs
+
+        # ------------ backward pass ------------ #
+        g_logprobs = tp_group.broadcast(g_logprobs, src=0) # handled by framework
+        d_hidden, d_weight = torch.autograd.grad(...)
+        # each rank will get distinct local d_hidden and d_weight
+        ```
+        """
+        with torch.cuda.nvtx.range("LinearCrossEntropy-forward"):
+            (
+                logprobs,
+                _maximum,
+                _acc,
+                _num_valid_tokens,
+                tp_rank,
+                tp_world_size,
+                global_hidden,
+            ) = _get_platform().forward_func(
+                hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel
+            )
+            ctx.save_for_backward(global_hidden, weight, labels, _maximum, _acc, _num_valid_tokens)
+            ctx.tp_group = tp_group
+            ctx.ignore_index = ignore_index
+            ctx.reduction = reduction
+            ctx.tp_rank = tp_rank
+            ctx.tp_world_size = tp_world_size
+            ctx.sequence_parallel = sequence_parallel
+
+        return logprobs
+
+    @staticmethod
+    def backward(
+        ctx, dlogprobs: torch.Tensor
+    ) -> typing.Tuple[torch.Tensor, torch.Tensor, None, None, None, None, None]:
+        """
+        The backward pass of the Linear Cross Entropy.
+        Args:
+            dlogprobs (torch.Tensor): The gradient of the cross entropy, with shape
+                - either (num_tokens,) when reduction is "none"
+                - or (1,) when reduction is "mean" or "sum"
+        Returns:
+            dhidden (torch.Tensor): The gradient of the hidden.
+            dweight (torch.Tensor): The gradient of the weight.
+        """
+        with torch.cuda.nvtx.range("LinearCrossEntropy-backward"):
+            (global_hidden, weight, labels, _maximum, _accu, _num_valid_tokens) = ctx.saved_tensors
+
+            tp_group = ctx.tp_group
+            ignore_index = ctx.ignore_index
+            reduction = ctx.reduction
+            tp_rank = ctx.tp_rank
+            tp_world_size = ctx.tp_world_size
+            sequence_parallel = ctx.sequence_parallel
+
+            d_hidden, d_weight = _get_platform().backward_func(
+                dlogprobs,
+                global_hidden,
+                weight,
+                labels,
+                _maximum,
+                _accu,
+                _num_valid_tokens,
+                reduction,
+                ignore_index,
+                tp_group,
+                tp_rank,
+                tp_world_size,
+                sequence_parallel,
+            )
+
+        return d_hidden, d_weight, None, None, None, None, None
+
+
+def linear_cross_entropy(
+    hidden: torch.Tensor,
+    weight: torch.Tensor,
+    labels: torch.Tensor,
+    tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
+    reduction: typing.Literal["none", "sum", "mean"] = "mean",
+    ignore_index: int = -100,
+    sequence_parallel: bool = False,
+) -> torch.Tensor:
+    """
+    helper function for linear cross entropy.
+    """
+    _impl = LinearCrossEntropy.apply
+    return _impl(hidden, weight, labels, tp_group, reduction, ignore_index, sequence_parallel)
+
+
+__all__ = ["linear_cross_entropy", "LinearCrossEntropy"]
diff --git a/megatron/core/fusions/fused_mhc_kernels.py b/megatron/core/fusions/fused_mhc_kernels.py
new file mode 100644
index 00000000000..6a19255196a
--- /dev/null
+++ b/megatron/core/fusions/fused_mhc_kernels.py
@@ -0,0 +1,964 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""Fused cuTile kernels for mHC (Manifold-Constrained Hyper-Connections).
+
+Requires cuda.tile (cuTile) for optimal performance on supported GPUs
+(compute capability 10.x+).  Reference (non-fused) implementations live in
+``megatron.core.transformer.hyper_connection`` and are used when cuTile is
+unavailable or when the ``use_fused_mhc`` config flag is False.
+
+Four fused operations:
+  - sinkhorn:     Sinkhorn-Knopp projection to doubly stochastic matrix
+  - h_aggregate:  weighted n-stream -> 1-stream aggregation
+  - h_post_bda:   fused H_res @ residual + H_post * (x + bias)
+  - proj_rms:     fused projection + RMS normalization
+"""
+
+import math
+from typing import Optional, Tuple
+
+import torch
+from torch import Tensor
+
+# ---------------------------------------------------------------------------
+# Check cuTile availability
+# ---------------------------------------------------------------------------
+_CUTILE_AVAILABLE = False
+try:
+    import cuda.tile as ct
+
+    _CUTILE_AVAILABLE = True
+except ImportError:
+    pass
+
+
+def is_cutile_available() -> bool:
+    """Return True if cuTile fused kernels are available."""
+    return _CUTILE_AVAILABLE
+
+
+# ============================================================================
+# CuTile implementations (only defined when cuda.tile is available)
+# ============================================================================
+
+if _CUTILE_AVAILABLE:
+    ConstInt = ct.Constant[int]
+    PAD_ZERO = ct.PaddingMode.ZERO
+    LOG2E = 1.4426950408889634
+
+    # -- Sinkhorn kernels ----------------------------------------------------
+
+    @ct.kernel
+    def _ct_sinkhorn_fwd_kernel(
+        inp, out, M_init_out, eps, HC: ConstInt, NUM_ITERS: ConstInt, TILE_SIZE: ConstInt
+    ):
+        pid = ct.bid(0)
+        logits = ct.load(inp, index=(pid, 0, 0), shape=(TILE_SIZE, HC, HC)).astype(ct.float32)
+        row_max = ct.max(logits, axis=2, keepdims=True)
+        M = ct.exp2((logits - row_max) * LOG2E)
+        ct.store(
+            M_init_out,
+            index=(pid, 0, 0),
+            tile=ct.reshape(M.astype(M_init_out.dtype), (TILE_SIZE, HC, HC)),
+        )
+        for _ in range(NUM_ITERS):
+            row_sum = ct.sum(M, axis=2, keepdims=True)
+            M = M / (row_sum + eps)
+            col_sum = ct.sum(M, axis=1, keepdims=True)
+            M = M / (col_sum + eps)
+        ct.store(out, index=(pid, 0, 0), tile=ct.reshape(M.astype(out.dtype), (TILE_SIZE, HC, HC)))
+
+    @ct.kernel
+    def _ct_sinkhorn_bwd_kernel(
+        grad_out,
+        M_init,
+        grad_inp,
+        ws_M,
+        ws_rs,
+        ws_cs,
+        eps,
+        HC: ConstInt,
+        NUM_ITERS: ConstInt,
+        TILE_SIZE: ConstInt,
+    ):
+        pid = ct.bid(0)
+        M_base = pid * (2 * NUM_ITERS)
+        v_base = pid * NUM_ITERS
+
+        M = ct.load(M_init, index=(pid, 0, 0), shape=(TILE_SIZE, HC, HC)).astype(ct.float32)
+        for t in range(NUM_ITERS):
+            ct.store(ws_M, index=(M_base + 2 * t, 0, 0), tile=M)
+            row_sum = ct.sum(M, axis=2, keepdims=True)
+            ct.store(ws_rs, index=(v_base + t, 0, 0), tile=row_sum)
+            M = M / (row_sum + eps)
+            ct.store(ws_M, index=(M_base + 2 * t + 1, 0, 0), tile=M)
+            col_sum = ct.sum(M, axis=1, keepdims=True)
+            ct.store(ws_cs, index=(v_base + t, 0, 0), tile=col_sum)
+            M = M / (col_sum + eps)
+
+        grad = ct.load(grad_out, index=(pid, 0, 0), shape=(TILE_SIZE, HC, HC)).astype(ct.float32)
+        for t_rev in range(NUM_ITERS):
+            t = NUM_ITERS - 1 - t_rev
+            col_s = ct.load(ws_cs, index=(v_base + t, 0, 0), shape=(TILE_SIZE, 1, HC))
+            grad = grad / (col_s + eps)
+            col_corr = ct.sum(grad * M, axis=1, keepdims=True)
+            grad = grad - col_corr
+            M = ct.load(ws_M, index=(M_base + 2 * t + 1, 0, 0), shape=(TILE_SIZE, HC, HC))
+            row_s = ct.load(ws_rs, index=(v_base + t, 0, 0), shape=(TILE_SIZE, HC, 1))
+            grad = grad / (row_s + eps)
+            row_corr = ct.sum(grad * M, axis=2, keepdims=True)
+            grad = grad - row_corr
+            M = ct.load(ws_M, index=(M_base + 2 * t, 0, 0), shape=(TILE_SIZE, HC, HC))
+        grad = grad * M
+        ct.store(grad_inp, index=(pid, 0, 0), tile=grad.astype(grad_inp.dtype))
+
+    def _cutile_sinkhorn_fwd(
+        input_logits: Tensor, num_iterations: int, eps: float = 1e-8
+    ) -> Tuple[Tensor, Tensor]:
+        original_shape = input_logits.shape
+        hc = original_shape[-1]
+        N_batch = input_logits.numel() // (hc * hc)
+        TILE_SIZE = math.gcd(N_batch, 128)
+        dev = input_logits.device
+        out = torch.empty(N_batch, hc, hc, dtype=input_logits.dtype, device=dev)
+        M_init = torch.empty(N_batch, hc, hc, dtype=input_logits.dtype, device=dev)
+        ct.launch(
+            torch.cuda.current_stream(),
+            (math.ceil(N_batch / TILE_SIZE), 1, 1),
+            _ct_sinkhorn_fwd_kernel,
+            (input_logits.view(N_batch, hc, hc), out, M_init, eps, hc, num_iterations, TILE_SIZE),
+        )
+        return out.view(original_shape), M_init.view(original_shape)
+
+    def _cutile_sinkhorn_bwd(
+        grad_output: Tensor, M_init: Tensor, num_iterations: int, eps: float = 1e-8
+    ) -> Tensor:
+        original_shape = grad_output.shape
+        hc = original_shape[-1]
+        N_batch = grad_output.numel() // (hc * hc)
+        TILE_SIZE = math.gcd(N_batch, 128)
+        dev = grad_output.device
+        ws_M = torch.empty(N_batch * 2 * num_iterations, hc, hc, dtype=torch.float32, device=dev)
+        ws_rs = torch.empty(N_batch * num_iterations, hc, 1, dtype=torch.float32, device=dev)
+        ws_cs = torch.empty(N_batch * num_iterations, 1, hc, dtype=torch.float32, device=dev)
+        grad_input = torch.empty(N_batch, hc, hc, dtype=grad_output.dtype, device=dev)
+        ct.launch(
+            torch.cuda.current_stream(),
+            (math.ceil(N_batch / TILE_SIZE), 1, 1),
+            _ct_sinkhorn_bwd_kernel,
+            (
+                grad_output.view(N_batch, hc, hc),
+                M_init.view(N_batch, hc, hc),
+                grad_input,
+                ws_M,
+                ws_rs,
+                ws_cs,
+                eps,
+                hc,
+                num_iterations,
+                TILE_SIZE,
+            ),
+        )
+        return grad_input.view(original_shape)
+
+    # -- H_aggregate kernels -------------------------------------------------
+
+    @ct.kernel
+    def _ct_h_agg_fwd_kernel(x, h_pre, out, N: ConstInt, TILE_M: ConstInt, TILE_C: ConstInt):
+        pid = ct.bid(0)
+        num_tiles = ct.num_tiles(x, axis=2, shape=(TILE_M, N, TILE_C))
+        h_tile = ct.load(h_pre, index=(pid, 0), shape=(TILE_M, N), padding_mode=PAD_ZERO)
+        h_tile = ct.expand_dims(h_tile, axis=2)
+        for j in range(num_tiles):
+            x_tile = ct.load(x, index=(pid, 0, j), shape=(TILE_M, N, TILE_C), padding_mode=PAD_ZERO)
+            acc = ct.sum(x_tile * h_tile, axis=1).astype(ct.float32)
+            ct.store(out, index=(pid, j), tile=acc.astype(out.dtype))
+
+    @ct.kernel
+    def _ct_h_agg_bwd_kernel(go, x, h_pre, gx, gh, N: ConstInt, TILE_M: ConstInt, TILE_C: ConstInt):
+        pid = ct.bid(0)
+        num_c_tiles = ct.num_tiles(go, axis=1, shape=(TILE_M, TILE_C))
+        h_tile = ct.load(h_pre, index=(pid, 0), shape=(TILE_M, N), padding_mode=PAD_ZERO)
+        h_expanded = ct.expand_dims(h_tile, axis=2)
+        gh_acc = ct.full((TILE_M, N), 0, dtype=ct.float32)
+        for ct_idx in range(num_c_tiles):
+            go_tile = ct.load(
+                go, index=(pid, ct_idx), shape=(TILE_M, TILE_C), padding_mode=PAD_ZERO
+            )
+            go_expanded = ct.expand_dims(go_tile, axis=1)
+            x_tile = ct.load(
+                x, index=(pid, 0, ct_idx), shape=(TILE_M, N, TILE_C), padding_mode=PAD_ZERO
+            )
+            gx_tile = go_expanded * h_expanded
+            ct.store(gx, index=(pid, 0, ct_idx), tile=gx_tile.astype(gx.dtype))
+            gh_acc += ct.sum(go_expanded * x_tile, axis=2)
+        ct.store(gh, index=(pid, 0), tile=gh_acc.astype(gh.dtype))
+
+    def _cutile_h_aggregate_fwd(x: Tensor, h_pre: Tensor) -> Tensor:
+        s, b, n, C = x.shape
+        sb = s * b
+        TILE_SIZE = math.gcd(sb, 4)
+        TILE_C = math.gcd(C, 1024)
+        out = torch.empty(sb, C, dtype=x.dtype, device=x.device)
+        ct.launch(
+            torch.cuda.current_stream(),
+            (math.ceil(sb / TILE_SIZE),),
+            _ct_h_agg_fwd_kernel,
+            (x.view(sb, n, C), h_pre.view(sb, n), out, n, TILE_SIZE, TILE_C),
+        )
+        return out.view(s, b, C)
+
+    def _cutile_h_aggregate_bwd(
+        grad_output: Tensor, x: Tensor, h_pre: Tensor
+    ) -> Tuple[Tensor, Tensor]:
+        s, b, n, C = x.shape
+        sb = s * b
+        TILE_C = math.gcd(C, 1024)
+        TILE_M = math.gcd(sb, 4)
+        gx = torch.empty(sb, n, C, dtype=x.dtype, device=x.device)
+        gh = torch.empty(sb, n, dtype=x.dtype, device=x.device)
+        ct.launch(
+            torch.cuda.current_stream(),
+            (math.ceil(sb / TILE_M),),
+            _ct_h_agg_bwd_kernel,
+            (
+                grad_output.view(sb, C),
+                x.view(sb, n, C),
+                h_pre.view(sb, n),
+                gx,
+                gh,
+                n,
+                TILE_M,
+                TILE_C,
+            ),
+        )
+        return gx.view(s, b, n, C), gh.view(s, b, n)
+
+    # -- H_post BDA kernels --------------------------------------------------
+
+    @ct.kernel
+    def _ct_hpb_fwd_kernel(
+        hr, orig, hp, x, out, N: ConstInt, TILE_C: ConstInt, TILE_SIZE: ConstInt
+    ):
+        pid = ct.bid(0)
+        num_c_tiles = ct.num_tiles(x, axis=1, shape=(TILE_SIZE, TILE_C))
+        hp_tile = ct.load(hp, index=(pid, 0), shape=(TILE_SIZE, N), padding_mode=PAD_ZERO)
+        hp_2d = ct.reshape(hp_tile, (N, 1))
+        hr_tile = ct.load(hr, index=(pid, 0, 0), shape=(TILE_SIZE, N, N), padding_mode=PAD_ZERO)
+        hr_2d = ct.reshape(hr_tile, (N, N))
+        for ct_idx in range(num_c_tiles):
+            orig_tile = ct.load(
+                orig, index=(pid, 0, ct_idx), shape=(TILE_SIZE, N, TILE_C), padding_mode=PAD_ZERO
+            )
+            orig_2d = ct.reshape(orig_tile, (N, TILE_C))
+            x_tile = ct.load(
+                x, index=(pid, ct_idx), shape=(TILE_SIZE, TILE_C), padding_mode=PAD_ZERO
+            )
+            x_2d = ct.reshape(x_tile, (1, TILE_C))
+            out_2d = hp_2d * x_2d
+            for j in range(N):
+                out_2d += ct.extract(hr_2d, (0, j), shape=(N, 1)) * ct.extract(
+                    orig_2d, (j, 0), shape=(1, TILE_C)
+                )
+            ct.store(
+                out,
+                index=(pid, 0, ct_idx),
+                tile=ct.reshape(out_2d, (TILE_SIZE, N, TILE_C)).astype(out.dtype),
+            )
+
+    @ct.kernel
+    def _ct_hpb_fwd_bias_kernel(
+        hr, orig, hp, x, bias, out, N: ConstInt, TILE_C: ConstInt, TILE_SIZE: ConstInt
+    ):
+        pid = ct.bid(0)
+        num_c_tiles = ct.num_tiles(x, axis=1, shape=(TILE_SIZE, TILE_C))
+        hp_tile = ct.load(hp, index=(pid, 0), shape=(TILE_SIZE, N), padding_mode=PAD_ZERO)
+        hp_2d = ct.reshape(hp_tile, (N, 1))
+        hr_tile = ct.load(hr, index=(pid, 0, 0), shape=(TILE_SIZE, N, N), padding_mode=PAD_ZERO)
+        hr_2d = ct.reshape(hr_tile, (N, N))
+        for ct_idx in range(num_c_tiles):
+            orig_tile = ct.load(
+                orig, index=(pid, 0, ct_idx), shape=(TILE_SIZE, N, TILE_C), padding_mode=PAD_ZERO
+            )
+            orig_2d = ct.reshape(orig_tile, (N, TILE_C))
+            x_tile = ct.load(
+                x, index=(pid, ct_idx), shape=(TILE_SIZE, TILE_C), padding_mode=PAD_ZERO
+            )
+            bias_tile = ct.load(bias, index=(ct_idx,), shape=(TILE_C,), padding_mode=PAD_ZERO)
+            xb_2d = ct.reshape(x_tile, (1, TILE_C)) + ct.reshape(bias_tile, (1, TILE_C))
+            out_2d = hp_2d * xb_2d
+            for j in range(N):
+                out_2d += ct.extract(hr_2d, (0, j), shape=(N, 1)) * ct.extract(
+                    orig_2d, (j, 0), shape=(1, TILE_C)
+                )
+            ct.store(
+                out,
+                index=(pid, 0, ct_idx),
+                tile=ct.reshape(out_2d, (TILE_SIZE, N, TILE_C)).astype(out.dtype),
+            )
+
+    @ct.kernel
+    def _ct_hpb_bwd_kernel(
+        go,
+        hr,
+        orig,
+        hp,
+        x,
+        g_hr,
+        g_orig,
+        g_hp,
+        g_x,
+        N: ConstInt,
+        TILE_C: ConstInt,
+        TILE_SIZE: ConstInt,
+    ):
+        pid = ct.bid(0)
+        num_c_tiles = ct.cdiv(go.shape[2], TILE_C)
+        hp_tile = ct.load(hp, index=(pid, 0), shape=(TILE_SIZE, N))
+        hp_2d = ct.reshape(hp_tile, (1, N))
+        hr_tile = ct.load(hr, index=(pid, 0, 0), shape=(TILE_SIZE, N, N), padding_mode=PAD_ZERO)
+        hr_2d = ct.reshape(hr_tile, (N, N))
+        acc_g_hp_2d = ct.full((N, 1), 0, dtype=ct.float32)
+        acc_g_hr_2d = ct.full((N, N), 0, dtype=ct.float32)
+        for ct_idx in range(num_c_tiles):
+            x_tile = ct.load(
+                x, index=(pid, ct_idx), shape=(TILE_SIZE, TILE_C), padding_mode=PAD_ZERO
+            )
+            x_2d = ct.reshape(x_tile, (1, TILE_C))
+            go_tile = ct.load(
+                go, index=(pid, 0, ct_idx), shape=(TILE_SIZE, N, TILE_C), padding_mode=PAD_ZERO
+            )
+            go_2d = ct.reshape(go_tile, (N, TILE_C))
+            orig_tile = ct.load(
+                orig, index=(pid, 0, ct_idx), shape=(TILE_SIZE, N, TILE_C), padding_mode=PAD_ZERO
+            )
+            orig_2d = ct.reshape(orig_tile, (N, TILE_C))
+            g_x_2d = ct.full((1, TILE_C), 0, dtype=hp.dtype)
+            g_orig_2d = ct.full((N, TILE_C), 0, dtype=hp.dtype)
+            for j in range(N):
+                g_x_2d += ct.extract(hp_2d, (0, j), shape=(1, 1)).item() * ct.extract(
+                    go_2d, (j, 0), shape=(1, TILE_C)
+                )
+                g_orig_2d += ct.extract(hr_2d, (j, 0), shape=(1, N)).reshape((N, 1)) * ct.extract(
+                    go_2d, (j, 0), shape=(1, TILE_C)
+                )
+            acc_g_hp_2d += ct.sum(go_2d * x_2d, axis=1, keepdims=True)
+            acc_g_hr_2d += ct.sum(
+                ct.expand_dims(go_2d, axis=1) * ct.expand_dims(orig_2d, axis=0), axis=2
+            )
+            ct.store(
+                g_x,
+                index=(pid, ct_idx),
+                tile=ct.reshape(g_x_2d, (TILE_SIZE, TILE_C)).astype(g_x.dtype),
+            )
+            ct.store(
+                g_orig,
+                index=(pid, 0, ct_idx),
+                tile=ct.reshape(g_orig_2d, (TILE_SIZE, N, TILE_C)).astype(g_orig.dtype),
+            )
+        ct.store(
+            g_hp, index=(pid, 0), tile=ct.reshape(acc_g_hp_2d, (TILE_SIZE, N)).astype(g_hp.dtype)
+        )
+        ct.store(
+            g_hr,
+            index=(pid, 0, 0),
+            tile=ct.reshape(acc_g_hr_2d, (TILE_SIZE, N, N)).astype(g_hr.dtype),
+        )
+
+    @ct.kernel
+    def _ct_hpb_bwd_bias_kernel(
+        go,
+        hr,
+        orig,
+        hp,
+        x,
+        bias,
+        g_hr,
+        g_orig,
+        g_hp,
+        g_x,
+        N: ConstInt,
+        TILE_C: ConstInt,
+        TILE_SIZE: ConstInt,
+    ):
+        pid = ct.bid(0)
+        num_c_tiles = ct.cdiv(go.shape[2], TILE_C)
+        hp_tile = ct.load(hp, index=(pid, 0), shape=(TILE_SIZE, N))
+        hp_2d = ct.reshape(hp_tile, (1, N))
+        hr_tile = ct.load(hr, index=(pid, 0, 0), shape=(TILE_SIZE, N, N), padding_mode=PAD_ZERO)
+        hr_2d = ct.reshape(hr_tile, (N, N))
+        acc_g_hp_2d = ct.full((N, 1), 0, dtype=ct.float32)
+        acc_g_hr_2d = ct.full((N, N), 0, dtype=ct.float32)
+        for ct_idx in range(num_c_tiles):
+            x_tile = ct.load(
+                x, index=(pid, ct_idx), shape=(TILE_SIZE, TILE_C), padding_mode=PAD_ZERO
+            )
+            bias_tile = ct.load(bias, index=(ct_idx,), shape=(TILE_C,), padding_mode=PAD_ZERO)
+            xb_2d = ct.reshape(x_tile, (1, TILE_C)) + ct.reshape(bias_tile, (1, TILE_C))
+            go_tile = ct.load(
+                go, index=(pid, 0, ct_idx), shape=(TILE_SIZE, N, TILE_C), padding_mode=PAD_ZERO
+            )
+            go_2d = ct.reshape(go_tile, (N, TILE_C))
+            orig_tile = ct.load(
+                orig, index=(pid, 0, ct_idx), shape=(TILE_SIZE, N, TILE_C), padding_mode=PAD_ZERO
+            )
+            orig_2d = ct.reshape(orig_tile, (N, TILE_C))
+            g_x_2d = ct.full((1, TILE_C), 0, dtype=hp.dtype)
+            g_orig_2d = ct.full((N, TILE_C), 0, dtype=hp.dtype)
+            for j in range(N):
+                g_x_2d += ct.extract(hp_2d, (0, j), shape=(1, 1)).item() * ct.extract(
+                    go_2d, (j, 0), shape=(1, TILE_C)
+                )
+                g_orig_2d += ct.extract(hr_2d, (j, 0), shape=(1, N)).reshape((N, 1)) * ct.extract(
+                    go_2d, (j, 0), shape=(1, TILE_C)
+                )
+            acc_g_hp_2d += ct.sum(go_2d * xb_2d, axis=1, keepdims=True)
+            acc_g_hr_2d += ct.sum(
+                ct.expand_dims(go_2d, axis=1) * ct.expand_dims(orig_2d, axis=0), axis=2
+            )
+            ct.store(
+                g_x,
+                index=(pid, ct_idx),
+                tile=ct.reshape(g_x_2d, (TILE_SIZE, TILE_C)).astype(g_x.dtype),
+            )
+            ct.store(
+                g_orig,
+                index=(pid, 0, ct_idx),
+                tile=ct.reshape(g_orig_2d, (TILE_SIZE, N, TILE_C)).astype(g_orig.dtype),
+            )
+        ct.store(
+            g_hp, index=(pid, 0), tile=ct.reshape(acc_g_hp_2d, (TILE_SIZE, N)).astype(g_hp.dtype)
+        )
+        ct.store(
+            g_hr,
+            index=(pid, 0, 0),
+            tile=ct.reshape(acc_g_hr_2d, (TILE_SIZE, N, N)).astype(g_hr.dtype),
+        )
+
+    def _cutile_h_post_bda_fwd(
+        h_res: Tensor, original_residual: Tensor, h_post: Tensor, x: Tensor, bias: Optional[Tensor]
+    ) -> Tensor:
+        s, b, n, C = original_residual.shape
+        sb = s * b
+        TILE_C = math.gcd(C, 1024)
+        TILE_SIZE = math.gcd(sb, 1)
+        out = torch.empty(sb, n, C, dtype=h_res.dtype, device=h_res.device)
+        grid = (math.ceil(sb / TILE_SIZE),)
+        if bias is not None:
+            ct.launch(
+                torch.cuda.current_stream(),
+                grid,
+                _ct_hpb_fwd_bias_kernel,
+                (
+                    h_res.view(sb, n, n),
+                    original_residual.view(sb, n, C),
+                    h_post.view(sb, n),
+                    x.view(sb, C),
+                    bias,
+                    out,
+                    n,
+                    TILE_C,
+                    TILE_SIZE,
+                ),
+            )
+        else:
+            ct.launch(
+                torch.cuda.current_stream(),
+                grid,
+                _ct_hpb_fwd_kernel,
+                (
+                    h_res.view(sb, n, n),
+                    original_residual.view(sb, n, C),
+                    h_post.view(sb, n),
+                    x.view(sb, C),
+                    out,
+                    n,
+                    TILE_C,
+                    TILE_SIZE,
+                ),
+            )
+        return out.view(s, b, n, C)
+
+    def _cutile_h_post_bda_bwd(
+        grad_output: Tensor,
+        h_res: Tensor,
+        original_residual: Tensor,
+        h_post: Tensor,
+        x: Tensor,
+        bias: Optional[Tensor],
+    ) -> Tuple[Tensor, Tensor, Tensor, Tensor, Optional[Tensor]]:
+        s, b, n, C = original_residual.shape
+        sb = s * b
+        TILE_C = math.gcd(C, 1024)
+        TILE_SIZE = math.gcd(sb, 1)
+        g_hr = torch.empty(sb, n, n, dtype=h_res.dtype, device=h_res.device)
+        g_res = torch.empty(sb, n, C, dtype=h_res.dtype, device=h_res.device)
+        g_hp = torch.empty(sb, n, dtype=h_res.dtype, device=h_res.device)
+        g_x = torch.empty(sb, C, dtype=h_res.dtype, device=h_res.device)
+        grid = (sb,)
+        if bias is not None:
+            ct.launch(
+                torch.cuda.current_stream(),
+                grid,
+                _ct_hpb_bwd_bias_kernel,
+                (
+                    grad_output.view(sb, n, C),
+                    h_res.view(sb, n, n),
+                    original_residual.view(sb, n, C),
+                    h_post.view(sb, n),
+                    x.view(sb, C),
+                    bias,
+                    g_hr,
+                    g_res,
+                    g_hp,
+                    g_x,
+                    n,
+                    TILE_C,
+                    TILE_SIZE,
+                ),
+            )
+        else:
+            ct.launch(
+                torch.cuda.current_stream(),
+                grid,
+                _ct_hpb_bwd_kernel,
+                (
+                    grad_output.view(sb, n, C),
+                    h_res.view(sb, n, n),
+                    original_residual.view(sb, n, C),
+                    h_post.view(sb, n),
+                    x.view(sb, C),
+                    g_hr,
+                    g_res,
+                    g_hp,
+                    g_x,
+                    n,
+                    TILE_C,
+                    TILE_SIZE,
+                ),
+            )
+        g_bias = g_x.sum(dim=0) if bias is not None else None
+        return (
+            g_hr.view(s, b, n, n),
+            g_res.view(s, b, n, C),
+            g_hp.view(s, b, n),
+            g_x.view(s, b, C),
+            g_bias,
+        )
+
+    # -- Proj RMS kernels ----------------------------------------------------
+
+    @ct.function
+    def _ct_rms_dnorm(a_tile, norm_tile, dr_tile, K):
+        inv_norm = ct.where(norm_tile > 0, 1.0 / norm_tile, 0.0)
+        inv_sqrt_k = 1.0 / ct.sqrt(K)
+        eps = 1e-8
+        u = norm_tile * inv_sqrt_k + eps
+        coeff = -(1.0 / (u * u)) * inv_sqrt_k
+        return dr_tile * coeff * a_tile * inv_norm
+
+    @ct.kernel
+    def _ct_proj_rms_fwd_kernel(
+        A,
+        B,
+        PROJ,
+        NORM,
+        R,
+        M: int,
+        N: int,
+        K: int,
+        eps: float,
+        TILE_M: ConstInt,
+        TILE_N: ConstInt,
+        TILE_K: ConstInt,
+    ):
+        tile_m_id = ct.bid(0)
+        num_k_tiles = ct.cdiv(K, TILE_K)
+        acc = ct.full((TILE_M, TILE_N), 0.0, dtype=ct.float32)
+        sum_sq = ct.full((TILE_M, 1), 0.0, dtype=ct.float32)
+        for tile_k_id in range(num_k_tiles):
+            a_tile = ct.load(
+                A, index=(tile_m_id, tile_k_id), shape=(TILE_M, TILE_K), padding_mode=PAD_ZERO
+            )
+            b_tile = ct.load(B, index=(0, tile_k_id), shape=(TILE_N, TILE_K), padding_mode=PAD_ZERO)
+            acc = ct.mma(
+                a_tile.astype(ct.tfloat32), b_tile.transpose().astype(ct.tfloat32), acc=acc
+            )
+            sum_sq += ct.sum(a_tile * a_tile, axis=1, keepdims=True)
+        norm_tile = ct.sqrt(sum_sq)
+        v = norm_tile / ct.sqrt(K) + eps
+        r_tile = 1.0 / v
+        ct.store(PROJ, index=(tile_m_id, 0), tile=acc.astype(PROJ.dtype))
+        ct.store(NORM, index=(tile_m_id, 0), tile=norm_tile.astype(NORM.dtype))
+        ct.store(R, index=(tile_m_id, 0), tile=r_tile.astype(R.dtype))
+
+    @ct.kernel
+    def _ct_proj_rms_bwd_kernel(
+        A,
+        B,
+        NORM,
+        DD,
+        DR,
+        DA,
+        DB,
+        M: int,
+        N: int,
+        K: int,
+        TILE_SIZE_M: ConstInt,
+        TILE_SIZE_N: ConstInt,
+        TILE_SIZE_K: ConstInt,
+    ):
+        zero_pad = ct.PaddingMode.ZERO
+        tile_k_id = ct.bid(0)
+        NUM_M_TILES = ct.cdiv(M, TILE_SIZE_M)
+        accumulator_db = ct.full((TILE_SIZE_K, TILE_SIZE_N), 0.0, dtype=ct.float32)
+        for tile_m_id in range(NUM_M_TILES):
+            accumulator_da = ct.full((TILE_SIZE_M, TILE_SIZE_K), 0.0, dtype=ct.float32)
+            a_tile = ct.load(
+                A,
+                index=(tile_m_id, tile_k_id),
+                shape=(TILE_SIZE_M, TILE_SIZE_K),
+                padding_mode=zero_pad,
+            )
+            norm_tile = ct.load(
+                NORM, index=(tile_m_id, 0), shape=(TILE_SIZE_M, 1), padding_mode=zero_pad
+            )
+            dr_tile = ct.load(
+                DR, index=(tile_m_id, 0), shape=(TILE_SIZE_M, 1), padding_mode=zero_pad
+            )
+            accumulator_da = accumulator_da + _ct_rms_dnorm(a_tile, norm_tile, dr_tile, K)
+            b_tile = ct.load(
+                B, index=(0, tile_k_id), shape=(TILE_SIZE_N, TILE_SIZE_K), padding_mode=zero_pad
+            )
+            dd_tile = ct.load(
+                DD, index=(tile_m_id, 0), shape=(TILE_SIZE_M, TILE_SIZE_N), padding_mode=zero_pad
+            )
+            dd_tile = ct.astype(dd_tile, ct.tfloat32)
+            accumulator_da = ct.mma(dd_tile, b_tile.astype(ct.tfloat32), acc=accumulator_da)
+            ct.store(DA, index=(tile_m_id, tile_k_id), tile=accumulator_da.astype(DA.dtype))
+            accumulator_db = ct.mma(
+                a_tile.transpose().astype(ct.tfloat32), dd_tile, acc=accumulator_db
+            )
+        ct.store(DB, index=(0, tile_k_id), tile=accumulator_db.transpose().astype(DB.dtype))
+
+    @ct.kernel
+    def _ct_proj_rms_bwd_small_k_kernel(
+        A, B, NORM, DD, DR, DA, DB, M: int, N: int, K: int, TILE_N_SIZE: ConstInt
+    ):
+        zero_pad = ct.PaddingMode.ZERO
+        TILE_DB_SIZE_M = 128
+        TILE_DB_SIZE_K = 64
+        NUM_M_TILES = ct.cdiv(M, TILE_DB_SIZE_M)
+        NUM_K_TILES = ct.cdiv(K, TILE_DB_SIZE_K)
+        if ct.bid(1) == 0:
+            for tile_id in range(ct.bid(0), NUM_K_TILES, ct.num_blocks(0)):
+                accumulator_db = ct.full((TILE_DB_SIZE_K, TILE_N_SIZE), 0.0, dtype=ct.float32)
+                for m_tile in range(NUM_M_TILES):
+                    a_tile = ct.load(
+                        A,
+                        index=(m_tile, tile_id),
+                        shape=(TILE_DB_SIZE_M, TILE_DB_SIZE_K),
+                        padding_mode=zero_pad,
+                    )
+                    dd_tile = ct.load(
+                        DD,
+                        index=(m_tile, 0),
+                        shape=(TILE_DB_SIZE_M, TILE_N_SIZE),
+                        padding_mode=zero_pad,
+                    )
+                    accumulator_db = ct.mma(
+                        a_tile.transpose().astype(ct.tfloat32),
+                        dd_tile.astype(ct.tfloat32),
+                        acc=accumulator_db,
+                    )
+                ct.store(
+                    DB,
+                    index=(0, tile_id),
+                    tile=accumulator_db.transpose().astype(DB.dtype),
+                    allow_tma=False,
+                )
+        TILE_DA_SIZE_M = 128
+        TILE_DA_SIZE_K = 256
+        NUM_DA_TILES = ct.cdiv(M, TILE_DA_SIZE_M) * ct.cdiv(K, TILE_DA_SIZE_K)
+        NUM_DA_K_TILES = ct.cdiv(K, TILE_DA_SIZE_K)
+        if ct.bid(1) == 1:
+            for tile_id in range(ct.bid(0), NUM_DA_TILES, ct.num_blocks(0)):
+                b_tile_idx = tile_id % NUM_DA_K_TILES
+                dd_tile_idx = tile_id // NUM_DA_K_TILES
+                accumulator_da = ct.full((TILE_DA_SIZE_M, TILE_DA_SIZE_K), 0.0, dtype=ct.float32)
+                a_tile = ct.load(
+                    A,
+                    index=(dd_tile_idx, b_tile_idx),
+                    shape=(TILE_DA_SIZE_M, TILE_DA_SIZE_K),
+                    padding_mode=zero_pad,
+                )
+                norm_tile = ct.load(
+                    NORM, index=(dd_tile_idx, 0), shape=(TILE_DA_SIZE_M, 1), padding_mode=zero_pad
+                )
+                dr_tile = ct.load(
+                    DR, index=(dd_tile_idx, 0), shape=(TILE_DA_SIZE_M, 1), padding_mode=zero_pad
+                )
+                accumulator_da = accumulator_da + _ct_rms_dnorm(
+                    a_tile.astype(ct.float32), norm_tile, dr_tile, K
+                )
+                b_tile = ct.load(
+                    B,
+                    index=(0, b_tile_idx),
+                    shape=(TILE_N_SIZE, TILE_DA_SIZE_K),
+                    padding_mode=zero_pad,
+                )
+                dd_tile = ct.load(
+                    DD,
+                    index=(dd_tile_idx, 0),
+                    shape=(TILE_DA_SIZE_M, TILE_N_SIZE),
+                    padding_mode=zero_pad,
+                )
+                accumulator_da = ct.mma(
+                    dd_tile.astype(ct.tfloat32), b_tile.astype(ct.tfloat32), acc=accumulator_da
+                )
+                ct.store(DA, index=(dd_tile_idx, b_tile_idx), tile=accumulator_da.astype(DA.dtype))
+
+    def _next_power_of_2(n: int) -> int:
+        n -= 1
+        n |= n >> 1
+        n |= n >> 2
+        n |= n >> 4
+        n |= n >> 8
+        n |= n >> 16
+        n |= n >> 32
+        n += 1
+        return n
+
+    def _cutile_proj_rms_fwd(
+        x: Tensor, weight: Tensor, eps: float = 1e-8
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        M, K = x.shape
+        N = weight.shape[0]
+        TILE_M = 128
+        TILE_N = _next_power_of_2(N)
+        TILE_K = 128
+        num_tiles_m = math.ceil(M / TILE_M)
+        proj = torch.empty(M, N, dtype=x.dtype, device=x.device)
+        norm = torch.empty(M, 1, dtype=x.dtype, device=x.device)
+        r = torch.empty(M, 1, dtype=x.dtype, device=x.device)
+        ct.launch(
+            torch.cuda.current_stream(),
+            (num_tiles_m,),
+            _ct_proj_rms_fwd_kernel,
+            (x, weight, proj, norm, r, M, N, K, eps, TILE_M, TILE_N, TILE_K),
+        )
+        return proj, norm, r
+
+    def _cutile_proj_rms_bwd(
+        grad_proj: Tensor,
+        grad_r: Tensor,
+        x: Tensor,
+        weight: Tensor,
+        norm: Tensor,
+        eps: float = 1e-8,
+    ) -> Tuple[Tensor, Tensor]:
+        M, K = x.shape
+        N = weight.shape[0]
+        da = torch.empty_like(x)
+        db = torch.empty_like(weight)
+        TILE_SIZE_N = _next_power_of_2(N)
+        assert TILE_SIZE_N <= 256, f"TILE_SIZE_N too large: {TILE_SIZE_N}"
+        num_sms = torch.cuda.get_device_properties("cuda").multi_processor_count
+        if K >= 8192:
+            TILE_SIZE_M, TILE_SIZE_K = 128, 128
+            grid = (math.ceil(K / TILE_SIZE_K), 1)
+            ct.launch(
+                torch.cuda.current_stream(),
+                grid,
+                _ct_proj_rms_bwd_kernel,
+                (
+                    x,
+                    weight,
+                    norm,
+                    grad_proj,
+                    grad_r,
+                    da,
+                    db,
+                    M,
+                    N,
+                    K,
+                    TILE_SIZE_M,
+                    TILE_SIZE_N,
+                    TILE_SIZE_K,
+                ),
+            )
+        else:
+            grid = (num_sms, 2, 1)
+            ct.launch(
+                torch.cuda.current_stream(),
+                grid,
+                _ct_proj_rms_bwd_small_k_kernel,
+                (x, weight, norm, grad_proj, grad_r, da, db, M, N, K, TILE_SIZE_N),
+            )
+        return da, db
+
+
+# ============================================================================
+# Autograd Functions (cuTile only – guarded by _CUTILE_AVAILABLE)
+# ============================================================================
+
+if not _CUTILE_AVAILABLE:
+
+    def _no_cutile_error(*_args, **_kwargs):
+        raise RuntimeError(
+            "Fused mHC kernels require cuda.tile (cuTile) which is not installed. "
+            "Either install cuTile or set use_fused_mhc=False to use reference "
+            "implementations."
+        )
+
+    fused_sinkhorn = _no_cutile_error
+    fused_h_aggregate = _no_cutile_error
+    fused_h_post_bda = _no_cutile_error
+    fused_proj_rms = _no_cutile_error
+
+else:
+
+    class FusedSinkhornKnopp(torch.autograd.Function):
+        """Fused Sinkhorn-Knopp projection to doubly stochastic matrix (cuTile)."""
+
+        @staticmethod
+        def forward(ctx, input_logits: Tensor, num_iterations: int, eps: float = 1e-6):
+            """cuTile fused Sinkhorn forward."""
+            output, M_init = _cutile_sinkhorn_fwd(input_logits, num_iterations, eps)
+            ctx.save_for_backward(M_init)
+            ctx.num_iterations = num_iterations
+            ctx.eps = eps
+            return output
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            """cuTile fused Sinkhorn backward."""
+            (M_init,) = ctx.saved_tensors
+            grad_input = _cutile_sinkhorn_bwd(grad_output, M_init, ctx.num_iterations, ctx.eps)
+            return grad_input, None, None
+
+    class FusedHAggregate(torch.autograd.Function):
+        """Fused n-stream weighted aggregation (cuTile)."""
+
+        @staticmethod
+        def forward(ctx, x: Tensor, h_pre: Tensor):
+            """cuTile fused h_aggregate forward."""
+            output = _cutile_h_aggregate_fwd(x, h_pre)
+            ctx.save_for_backward(x, h_pre)
+            return output
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            """cuTile fused h_aggregate backward."""
+            x, h_pre = ctx.saved_tensors
+            return _cutile_h_aggregate_bwd(grad_output, x, h_pre)
+
+    class FusedHPostBDA(torch.autograd.Function):
+        """Fused: output = H_res @ orig_res + H_post * (x [+ bias]) (cuTile)."""
+
+        @staticmethod
+        def forward(
+            ctx,
+            h_res: Tensor,
+            original_residual: Tensor,
+            h_post: Tensor,
+            x: Tensor,
+            bias: Optional[Tensor],
+        ):
+            """cuTile fused h_post_bda forward."""
+            output = _cutile_h_post_bda_fwd(h_res, original_residual, h_post, x, bias)
+            if bias is not None:
+                ctx.save_for_backward(h_res, original_residual, h_post, x, bias)
+                ctx.has_bias = True
+            else:
+                ctx.save_for_backward(h_res, original_residual, h_post, x)
+                ctx.has_bias = False
+            return output
+
+        @staticmethod
+        def backward(ctx, grad_output):
+            """cuTile fused h_post_bda backward."""
+            if ctx.has_bias:
+                h_res, orig_res, h_post, x, bias = ctx.saved_tensors
+            else:
+                h_res, orig_res, h_post, x = ctx.saved_tensors
+                bias = None
+            return _cutile_h_post_bda_bwd(grad_output, h_res, orig_res, h_post, x, bias)
+
+    class FusedProjRms(torch.autograd.Function):
+        """Fused projection + RMS normalization (cuTile)."""
+
+        @staticmethod
+        def forward(ctx, x: Tensor, weight: Tensor, eps: float = 1e-6):
+            """cuTile fused proj_rms forward."""
+            proj, norm, r = _cutile_proj_rms_fwd(x, weight, eps)
+            ctx.save_for_backward(x, weight, norm)
+            ctx.eps = eps
+            return proj, r
+
+        @staticmethod
+        def backward(ctx, grad_proj, grad_r):
+            """cuTile fused proj_rms backward."""
+            x, weight, norm = ctx.saved_tensors
+            grad_x, grad_weight = _cutile_proj_rms_bwd(grad_proj, grad_r, x, weight, norm, ctx.eps)
+            return grad_x, grad_weight, None
+
+    # ========================================================================
+    # Public API (only available when cuTile is installed)
+    # ========================================================================
+
+    def fused_sinkhorn(input_logits: Tensor, num_iterations: int, eps: float = 1e-6) -> Tensor:
+        """Project logits to doubly stochastic matrix via Sinkhorn-Knopp.
+
+        Args:
+            input_logits: [..., n, n] raw logits
+            num_iterations: Sinkhorn iterations
+            eps: numerical stability
+
+        Returns:
+            [..., n, n] doubly stochastic matrix
+        """
+        return FusedSinkhornKnopp.apply(input_logits, num_iterations, eps)
+
+    def fused_h_aggregate(x: Tensor, h_pre: Tensor) -> Tensor:
+        """Weighted n-stream to 1-stream aggregation.
+
+        Args:
+            x: [s, b, n, C] n-stream hidden states
+            h_pre: [s, b, n] aggregation weights
+
+        Returns:
+            [s, b, C] aggregated hidden states
+        """
+        return FusedHAggregate.apply(x, h_pre)
+
+    def fused_h_post_bda(
+        h_res: Tensor, original_residual: Tensor, h_post: Tensor, x: Tensor, bias: Optional[Tensor]
+    ) -> Tensor:
+        """Fused H_res @ residual + H_post * (x + bias).
+
+        Args:
+            h_res: [s, b, n, n] residual mixing matrix
+            original_residual: [s, b, n, C] n-stream residual
+            h_post: [s, b, n] expansion weights
+            x: [s, b, C] layer output
+            bias: [C] or None
+
+        Returns:
+            [s, b, n, C] fused output
+        """
+        return FusedHPostBDA.apply(h_res, original_residual, h_post, x, bias)
+
+    def fused_proj_rms(x: Tensor, weight: Tensor, eps: float = 1e-6) -> Tuple[Tensor, Tensor]:
+        """Fused projection + RMS normalization.
+
+        Args:
+            x: [M, K] input
+            weight: [N, K] projection weight
+            eps: stability epsilon
+
+        Returns:
+            proj: [M, N] = x @ weight^T
+            r: [M, 1] = 1 / (||x|| / sqrt(K) + eps)
+        """
+        return FusedProjRms.apply(x, weight, eps)
diff --git a/megatron/core/fusions/fused_mla_yarn_rope_apply.py b/megatron/core/fusions/fused_mla_yarn_rope_apply.py
index 1fd5dcfae37..6eed7581d03 100644
--- a/megatron/core/fusions/fused_mla_yarn_rope_apply.py
+++ b/megatron/core/fusions/fused_mla_yarn_rope_apply.py
@@ -65,11 +65,11 @@ def _get_thd_token_idx(cu_seqlens, pid_m, seq_num, cp_rank, cp_size):
     restore_value=["Q"],
 )
 @triton.jit
-def rotary_fwd_q_kernel(
+def _mla_rope_fwd_inplace_kernel(
     Q,
     COS,
     SIN,
-    qk_head_dim,
+    nope_dim,
     emb_dim: tl.constexpr,
     head_num: tl.constexpr,
     batch_size,
@@ -77,17 +77,21 @@ def rotary_fwd_q_kernel(
     cu_seqlens_q,
     stride_x_seq,
     stride_x_nheads,
+    stride_cos_seq,
+    stride_sin_seq,
     cp_rank,
     cp_size,
+    INVERSE: tl.constexpr,
+    REMOVE_INTERLEAVING: tl.constexpr,
     BLOCK_H: tl.constexpr,
 ):
     """
-    Triton kernel of the forward pass for applying YARN RoPE to MLA's query.
-    This kernel inplace modifies the input tensor Q.
+    Forward pass: apply RoPE inplace to the trailing emb_dim elements.
+    Reads from interleaved layout, writes back to interleaved layout.
 
     Input:
-        Q: [seq_len, batch_size, head_num, qk_head_dim + emb_dim]
-            or [total_seq_len, head_num, qk_head_dim + emb_dim]
+        Q: [seq_len, batch_size, head_num, nope_dim + emb_dim]
+            or [total_seq_len, head_num, nope_dim + emb_dim]
         COS/SIN: [max_seq_len, emb_dim]
 
         batch_size: batch size for sbhd format, not used for thd format
@@ -102,10 +106,17 @@ def rotary_fwd_q_kernel(
     else:
         token_idx = _get_thd_token_idx(cu_seqlens_q, pid_m, seq_num, cp_rank, cp_size)
 
-    cos_left = tl.load(COS + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
-    sin_left = tl.load(SIN + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
-    cos_right = tl.load(COS + token_idx * emb_dim + emb_dim // 2 + tl.arange(0, emb_dim // 2))
-    sin_right = tl.load(SIN + token_idx * emb_dim + emb_dim // 2 + tl.arange(0, emb_dim // 2))
+    cos_left = tl.load(COS + token_idx * stride_cos_seq + tl.arange(0, emb_dim // 2))
+    sin_left = tl.load(SIN + token_idx * stride_sin_seq + tl.arange(0, emb_dim // 2))
+    cos_right = tl.load(
+        COS + token_idx * stride_cos_seq + emb_dim // 2 + tl.arange(0, emb_dim // 2)
+    )
+    sin_right = tl.load(
+        SIN + token_idx * stride_sin_seq + emb_dim // 2 + tl.arange(0, emb_dim // 2)
+    )
+    if INVERSE:
+        sin_left = -sin_left
+        sin_right = -sin_right
     cos_left = cos_left.expand_dims(0).broadcast_to(BLOCK_H, emb_dim // 2)
     sin_left = sin_left.expand_dims(0).broadcast_to(BLOCK_H, emb_dim // 2)
     cos_right = cos_right.expand_dims(0).broadcast_to(BLOCK_H, emb_dim // 2)
@@ -113,7 +124,7 @@ def rotary_fwd_q_kernel(
 
     Q = Q + pid_m * stride_x_seq + pid_head * BLOCK_H * stride_x_nheads
 
-    x_off = tl.arange(0, BLOCK_H)[:, None] * stride_x_nheads + qk_head_dim
+    x_off = tl.arange(0, BLOCK_H)[:, None] * stride_x_nheads + nope_dim
     mask = x_off < head_num * stride_x_nheads
     # x1 = t[..., 0::2], x2 = t[..., 1::2]
     x_1_off = x_off + tl.arange(0, emb_dim // 2)[None, :] * 2
@@ -124,10 +135,14 @@ def rotary_fwd_q_kernel(
     x_left = x_1 * cos_left - x_2 * sin_left
     x_right = x_2 * cos_right + x_1 * sin_right
 
-    x_left_off = x_off + tl.arange(0, emb_dim // 2)[None, :]
-    x_right_off = x_left_off + emb_dim // 2
-    tl.store(Q + x_left_off, x_left, mask=mask)
-    tl.store(Q + x_right_off, x_right, mask=mask)
+    if REMOVE_INTERLEAVING:
+        tl.store(Q + x_1_off, x_left, mask=mask)
+        tl.store(Q + x_2_off, x_right, mask=mask)
+    else:
+        x_left_off = x_off + tl.arange(0, emb_dim // 2)[None, :]
+        x_right_off = x_left_off + emb_dim // 2
+        tl.store(Q + x_left_off, x_left, mask=mask)
+        tl.store(Q + x_right_off, x_right, mask=mask)
 
 
 @triton.autotune(
@@ -145,11 +160,11 @@ def rotary_fwd_q_kernel(
     restore_value=["DO"],
 )
 @triton.jit
-def rotary_bwd_q_kernel(
+def _mla_rope_bwd_inplace_kernel(
     DO,
     COS,
     SIN,
-    qk_head_dim,
+    nope_dim,
     emb_dim: tl.constexpr,
     head_num: tl.constexpr,
     batch_size,
@@ -157,17 +172,21 @@ def rotary_bwd_q_kernel(
     cu_seqlens_q,
     stride_x_seq,
     stride_x_nheads,
+    stride_cos_seq,
+    stride_sin_seq,
     cp_rank,
     cp_size,
+    INVERSE: tl.constexpr,
+    REMOVE_INTERLEAVING: tl.constexpr,
     BLOCK_H: tl.constexpr,
 ):
     """
-    Triton kernel of the backward pass for applying YARN RoPE to MLA's query.
-    This kernel inplace modifies the input tensor DO.
+    Backward pass: inverse RoPE inplace on the trailing emb_dim elements.
+    Reads from interleaved layout, writes to interleaved layout.
 
     Input:
-        DO: [seq_len, batch_size, head_num, qk_head_dim + emb_dim]
-            or [total_seq_len, head_num, qk_head_dim + emb_dim]
+        DO: [seq_len, batch_size, head_num, nope_dim + emb_dim]
+            or [total_seq_len, head_num, nope_dim + emb_dim]
         COS/SIN: [max_seq_len, emb_dim]
 
         batch_size, seq_num, and cu_seqlens_q are the same as in the forward pass
@@ -180,10 +199,17 @@ def rotary_bwd_q_kernel(
     else:
         token_idx = _get_thd_token_idx(cu_seqlens_q, pid_m, seq_num, cp_rank, cp_size)
 
-    cos_left = tl.load(COS + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
-    sin_left = tl.load(SIN + token_idx * emb_dim + tl.arange(0, emb_dim // 2))
-    cos_right = tl.load(COS + token_idx * emb_dim + emb_dim // 2 + tl.arange(0, emb_dim // 2))
-    sin_right = tl.load(SIN + token_idx * emb_dim + emb_dim // 2 + tl.arange(0, emb_dim // 2))
+    cos_left = tl.load(COS + token_idx * stride_cos_seq + tl.arange(0, emb_dim // 2))
+    sin_left = tl.load(SIN + token_idx * stride_sin_seq + tl.arange(0, emb_dim // 2))
+    cos_right = tl.load(
+        COS + token_idx * stride_cos_seq + emb_dim // 2 + tl.arange(0, emb_dim // 2)
+    )
+    sin_right = tl.load(
+        SIN + token_idx * stride_sin_seq + emb_dim // 2 + tl.arange(0, emb_dim // 2)
+    )
+    if INVERSE:
+        sin_left = -sin_left
+        sin_right = -sin_right
     cos_left = cos_left.expand_dims(0).broadcast_to(BLOCK_H, emb_dim // 2)
     sin_left = sin_left.expand_dims(0).broadcast_to(BLOCK_H, emb_dim // 2)
     cos_right = cos_right.expand_dims(0).broadcast_to(BLOCK_H, emb_dim // 2)
@@ -191,25 +217,32 @@ def rotary_bwd_q_kernel(
 
     DO = DO + pid_m * stride_x_seq + pid_head * BLOCK_H * stride_x_nheads
 
-    x_off = tl.arange(0, BLOCK_H)[:, None] * stride_x_nheads + qk_head_dim
+    x_off = tl.arange(0, BLOCK_H)[:, None] * stride_x_nheads + nope_dim
     mask = x_off < head_num * stride_x_nheads
-    x_left_off = x_off + tl.arange(0, emb_dim // 2)[None, :]
-    x_right_off = x_left_off + emb_dim // 2
-    x_left = tl.load(DO + x_left_off, mask=mask)
-    x_right = tl.load(DO + x_right_off, mask=mask)
+    if REMOVE_INTERLEAVING:
+        x_1_off = x_off + tl.arange(0, emb_dim // 2)[None, :] * 2
+        x_2_off = x_1_off + 1
+        x_left = tl.load(DO + x_1_off, mask=mask)
+        x_right = tl.load(DO + x_2_off, mask=mask)
+    else:
+        x_left_off = x_off + tl.arange(0, emb_dim // 2)[None, :]
+        x_right_off = x_left_off + emb_dim // 2
+        x_left = tl.load(DO + x_left_off, mask=mask)
+        x_right = tl.load(DO + x_right_off, mask=mask)
+        x_1_off = x_off + tl.arange(0, emb_dim // 2)[None, :] * 2
+        x_2_off = x_1_off + 1
 
     x_1 = x_left * cos_left + x_right * sin_right
     x_2 = -x_left * sin_left + x_right * cos_right
 
-    x_1_off = x_off + tl.arange(0, emb_dim // 2)[None, :] * 2
-    x_2_off = x_1_off + 1
     tl.store(DO + x_1_off, x_1, mask=mask)
     tl.store(DO + x_2_off, x_2, mask=mask)
 
 
-class ApplyMLARotaryEmbQ(torch.autograd.Function):
+class _FusedMLARoPEInplace(torch.autograd.Function):
     """
-    Autograd function for applying YARN RoPE to MLA's query.
+    Autograd function for applying RoPE inplace to the trailing emb_dim
+    elements of a multi-head tensor (leaving the first nope_dim elements unchanged).
     """
 
     @staticmethod
@@ -218,22 +251,25 @@ def forward(
         q,
         cos,
         sin,
-        qk_head_dim,
+        nope_dim,
         emb_dim,
         cu_seqlens_q,
         cp_rank,
         cp_size,
         rotary_interleaved=False,
+        inverse=False,
+        remove_interleaving=False,
     ):
         """
-        Forward function for ApplyMLARotaryEmbQ.
+        Forward function for _FusedMLARoPEInplace.
 
         Args:
-            q: [seq_len, batch_size, head_num, qk_head_dim + emb_dim]
-                or [total_seq_len, head_num, qk_head_dim + emb_dim]
+            q: [seq_len, batch_size, head_num, nope_dim + emb_dim]
+                or [total_seq_len, head_num, nope_dim + emb_dim]
             cos/sin: [max_seq_len, 1, 1, emb_dim]
             cu_seqlens_q: [seq_num + 1] accumulated sequence lengths for thd format
             rotary_interleaved: whether to apply RoPE interleaved, only supports False for now
+            inverse: if True, negate sin inside the kernel to apply the inverse rotation
         """
         assert not rotary_interleaved
         max_seqlen = None
@@ -249,17 +285,17 @@ def forward(
             total_seqlen, nheads, headdim = q.shape
             seq_num = len(cu_seqlens_q) - 1
         assert q.stride(-1) == 1
-        assert cos.is_contiguous()
-        assert sin.is_contiguous()
-        assert headdim == qk_head_dim + emb_dim
+        assert cos.stride(-1) == 1
+        assert sin.stride(-1) == 1
+        assert headdim == nope_dim + emb_dim
         assert emb_dim % 4 == 0
 
         grid = lambda META: (total_seqlen, triton.cdiv(nheads, META["BLOCK_H"]))
-        rotary_fwd_q_kernel[grid](
+        _mla_rope_fwd_inplace_kernel[grid](
             q,
             cos,
             sin,
-            qk_head_dim,
+            nope_dim,
             emb_dim,
             nheads,
             batch_size,
@@ -267,14 +303,20 @@ def forward(
             cu_seqlens_q,
             q.stride(0),
             q.stride(1),
+            cos.stride(0),
+            sin.stride(0),
             cp_rank,
             cp_size,
+            INVERSE=inverse,
+            REMOVE_INTERLEAVING=remove_interleaving,
         )
         ctx.save_for_backward(cos, sin)
-        ctx.qk_head_dim = qk_head_dim
+        ctx.nope_dim = nope_dim
         ctx.emb_dim = emb_dim
         ctx.cu_seqlens_q = cu_seqlens_q
         ctx.rotary_interleaved = rotary_interleaved
+        ctx.inverse = inverse
+        ctx.remove_interleaving = remove_interleaving
         ctx.cp_rank = cp_rank
         ctx.cp_size = cp_size
         if cu_seqlens_q is None:
@@ -284,11 +326,11 @@ def forward(
     @staticmethod
     def backward(ctx, grad):
         """
-        Backward function for ApplyMLARotaryEmbQ.
+        Backward function for _FusedMLARoPEInplace.
 
         Args:
-            grad: [seq_len, batch_size, head_num, qk_head_dim + emb_dim]
-                or [total_seq_len, head_num, qk_head_dim + emb_dim]
+            grad: [seq_len, batch_size, head_num, nope_dim + emb_dim]
+                or [total_seq_len, head_num, nope_dim + emb_dim]
         """
         cos, sin = ctx.saved_tensors
         max_seqlen = None
@@ -304,11 +346,11 @@ def backward(ctx, grad):
         assert grad.stride(-1) == 1
 
         grid = lambda META: (total_seqlen, triton.cdiv(nheads, META["BLOCK_H"]))
-        rotary_bwd_q_kernel[grid](
+        _mla_rope_bwd_inplace_kernel[grid](
             grad,
             cos,
             sin,
-            ctx.qk_head_dim,
+            ctx.nope_dim,
             ctx.emb_dim,
             nheads,
             batch_size,
@@ -316,49 +358,67 @@ def backward(ctx, grad):
             ctx.cu_seqlens_q,
             grad.stride(0),
             grad.stride(1),
+            cos.stride(0),
+            sin.stride(0),
             ctx.cp_rank,
             ctx.cp_size,
+            INVERSE=ctx.inverse,
+            REMOVE_INTERLEAVING=ctx.remove_interleaving,
         )
         if ctx.cu_seqlens_q is None:
             grad = grad.view(max_seqlen, batch_size, nheads, headdim)
-        return grad, None, None, None, None, None, None, None, None
+        return grad, None, None, None, None, None, None, None, None, None, None
 
 
-def fused_apply_mla_rope_for_q(
+def fused_mla_rope_inplace(
     t: torch.Tensor,
     cos: torch.Tensor,
     sin: torch.Tensor,
-    qk_head_dim: int,
+    nope_dim: int,
     emb_dim: int,
     cu_seqlens_q: Optional[torch.Tensor] = None,
     cp_rank: int = 0,
     cp_size: int = 1,
     rotary_interleaved: bool = False,
+    inverse: bool = False,
+    remove_interleaving: bool = False,
 ):
     """
-    Fused function for applying YARN RoPE to MLA's query.
-    This function inplace modifies the input tensor t.
-    Along the last dimension of t, the last emb_dim elements are applied with RoPE.
-    The first qk_head_dim elements are not modified.
-    It is an experimental feature and may change in future versions.
+    Fused RoPE applied inplace to the trailing emb_dim elements of a tensor,
+    leaving the first nope_dim elements unchanged.
     It supports both sbhd and thd input formats.
 
+    When ``inverse=True`` the rotation is reversed, which is useful for
+    undoing RoPE on the attention output.
+
     For the notations below, seq_len is the length of the sequence per batch for sbhd format,
     total_seq_len is the total length of the sequences for thd format.
     max_seq_len is the maximum length of the sequences in the input tensor.
 
     Args:
-        t: [seq_len, batch_size, head_num, qk_head_dim + emb_dim]
-            or [total_seq_len, head_num, qk_head_dim + emb_dim]
+        t: [seq_len, batch_size, head_num, nope_dim + emb_dim]
+            or [total_seq_len, head_num, nope_dim + emb_dim]
         cos/sin: [max_seq_len, 1, 1, emb_dim]
         cu_seqlens_q: [seq_num + 1] accumulated sequence lengths for thd format
         rotary_interleaved: whether to apply RoPE interleaved, only supports False for now
+        inverse: if True, apply the inverse rotation
+        remove_interleaving: if True, output RoPE dims in non-interleaved layout
 
     Returns:
         t: inplace modified input tensor
     """
-    return ApplyMLARotaryEmbQ.apply(
-        t, cos, sin, qk_head_dim, emb_dim, cu_seqlens_q, cp_rank, cp_size, rotary_interleaved
+    return _FusedMLARoPEInplace.apply(
+        t,
+        cos,
+        sin,
+        nope_dim,
+        emb_dim,
+        cu_seqlens_q,
+        cp_rank,
+        cp_size,
+        rotary_interleaved,
+        inverse,
+        remove_interleaving,
     )
 
 
@@ -376,7 +436,7 @@ def fused_apply_mla_rope_for_q(
     key=["emb_dim", "k_dim", "v_dim", "head_num"],
 )
 @triton.jit
-def rotary_fwd_kv_kernel(
+def _mla_rope_fwd_kv_split_kernel(
     KV,
     K_POS_EMB,
     O_KEY,
@@ -399,12 +459,12 @@ def rotary_fwd_kv_kernel(
     stride_v_nheads,
     cp_rank,
     cp_size,
+    REMOVE_INTERLEAVING: tl.constexpr,
     BLOCK_H: tl.constexpr,
 ):
     """
-    Triton kernel of the forward pass for applying YARN RoPE to MLA's key and value.
-    It splits the input tensor KV into key and value,
-    and concatenates the processed RoPE to the key.
+    Forward pass: split KV into key and value, apply RoPE to k_pos_emb,
+    and concatenate the result onto key.
 
     Input:
         KV: [seq_len, batch_size, head_num, k_dim + v_dim]
@@ -460,14 +520,24 @@ def rotary_fwd_kv_kernel(
     x_left = x_left.expand_dims(0).broadcast_to(BLOCK_H, emb_dim // 2)
     x_right = x_right.expand_dims(0).broadcast_to(BLOCK_H, emb_dim // 2)
 
-    x_left_off = (
-        tl.arange(0, BLOCK_H)[:, None] * stride_k_nheads
-        + k_dim
-        + tl.arange(0, emb_dim // 2)[None, :]
-    )
-    x_right_off = x_left_off + emb_dim // 2
-    tl.store(K_ptr + x_left_off, x_left, mask=mask)
-    tl.store(K_ptr + x_right_off, x_right, mask=mask)
+    if REMOVE_INTERLEAVING:
+        x_1_off = (
+            tl.arange(0, BLOCK_H)[:, None] * stride_k_nheads
+            + k_dim
+            + tl.arange(0, emb_dim // 2)[None, :] * 2
+        )
+        x_2_off = x_1_off + 1
+        tl.store(K_ptr + x_1_off, x_left, mask=mask)
+        tl.store(K_ptr + x_2_off, x_right, mask=mask)
+    else:
+        x_left_off = (
+            tl.arange(0, BLOCK_H)[:, None] * stride_k_nheads
+            + k_dim
+            + tl.arange(0, emb_dim // 2)[None, :]
+        )
+        x_right_off = x_left_off + emb_dim // 2
+        tl.store(K_ptr + x_left_off, x_left, mask=mask)
+        tl.store(K_ptr + x_right_off, x_right, mask=mask)
 
 
 @triton.autotune(
@@ -484,7 +554,7 @@ def rotary_fwd_kv_kernel(
     key=["emb_dim", "k_dim", "v_dim", "head_num"],
 )
 @triton.jit
-def rotary_bwd_kv_kernel(
+def _mla_rope_bwd_kv_split_kernel(
     dK,
     dV,
     dKV,
@@ -507,10 +577,11 @@ def rotary_bwd_kv_kernel(
     stride_demb_seq,
     cp_rank,
     cp_size,
+    REMOVE_INTERLEAVING: tl.constexpr,
     BLOCK_H: tl.constexpr,
 ):
     """
-    Triton kernel of the backward pass for applying YARN RoPE to MLA's key and value.
+    Backward pass for the KV-split RoPE.
 
     Input:
         dK: [seq_len, batch_size, head_num, emb_dim + k_dim]
@@ -555,10 +626,16 @@ def rotary_bwd_kv_kernel(
             dK_ptr = dK + pid_m * stride_dk_seq + i * BLOCK_H * stride_dk_nheads
             x_off = tl.arange(0, BLOCK_H)[:, None] * stride_dk_nheads + k_dim
             mask = x_off < head_num * stride_dk_nheads
-            x_left_off = x_off + tl.arange(0, emb_dim // 2)[None, :]
-            x_right_off = x_left_off + emb_dim // 2
-            x_left = tl.load(dK_ptr + x_left_off, mask=mask)
-            x_right = tl.load(dK_ptr + x_right_off, mask=mask)
+            if REMOVE_INTERLEAVING:
+                x_1_off = x_off + tl.arange(0, emb_dim // 2)[None, :] * 2
+                x_2_off = x_1_off + 1
+                x_left = tl.load(dK_ptr + x_1_off, mask=mask)
+                x_right = tl.load(dK_ptr + x_2_off, mask=mask)
+            else:
+                x_left_off = x_off + tl.arange(0, emb_dim // 2)[None, :]
+                x_right_off = x_left_off + emb_dim // 2
+                x_left = tl.load(dK_ptr + x_left_off, mask=mask)
+                x_right = tl.load(dK_ptr + x_right_off, mask=mask)
             x_left_accum += x_left
             x_right_accum += x_right
         x_left_accum = tl.sum(x_left_accum, axis=0)
@@ -578,9 +655,10 @@ def rotary_bwd_kv_kernel(
         tl.store(dEMB_ptr + tl.arange(0, emb_dim // 2) * 2 + 1, x_2)
 
 
-class ApplyMLARotaryEmbKV(torch.autograd.Function):
+class _FusedMLARoPEKVSplit(torch.autograd.Function):
     """
-    Autograd function for applying YARN RoPE to MLA's key and value.
+    Autograd function for applying RoPE to MLA's key and value.
+    Splits KV, applies RoPE to k_pos_emb, concatenates onto key.
     """
 
     @staticmethod
@@ -597,9 +675,10 @@ def forward(
         cp_rank,
         cp_size,
         rotary_interleaved=False,
+        remove_interleaving=False,
     ):
         """
-        Forward function for ApplyMLARotaryEmbKV.
+        Forward function for _FusedMLARoPEKVSplit.
 
         Args:
             kv: [seq_len, batch_size, head_num, k_dim + v_dim]
@@ -634,7 +713,7 @@ def forward(
         o_value = kv.new_empty(total_seqlen, nheads, v_dim)
 
         grid = lambda META: (total_seqlen, triton.cdiv(nheads, META["BLOCK_H"]))
-        rotary_fwd_kv_kernel[grid](
+        _mla_rope_fwd_kv_split_kernel[grid](
             kv,
             k_pos_emb,
             o_key,
@@ -657,8 +736,10 @@ def forward(
             o_value.stride(1),
             cp_rank,
             cp_size,
+            REMOVE_INTERLEAVING=remove_interleaving,
         )
         ctx.save_for_backward(cos, sin)
+        ctx.remove_interleaving = remove_interleaving
         ctx.rotary_interleaved = rotary_interleaved
         ctx.emb_dim = emb_dim
         ctx.k_dim = k_dim
@@ -674,7 +755,7 @@ def forward(
     @staticmethod
     def backward(ctx, dk, dv):
         """
-        Backward function for ApplyMLARotaryEmbKV.
+        Backward function for _FusedMLARoPEKVSplit.
 
         Args:
             dk: [seq_len, batch_size, head_num, emb_dim + k_dim]
@@ -702,7 +783,7 @@ def backward(ctx, dk, dv):
         d_emb = dk.new_empty(total_seqlen, 1, ctx.emb_dim)
 
         grid = lambda META: (total_seqlen, triton.cdiv(nheads, META["BLOCK_H"]))
-        rotary_bwd_kv_kernel[grid](
+        _mla_rope_bwd_kv_split_kernel[grid](
             dk,
             dv,
             d_kv,
@@ -725,14 +806,15 @@ def backward(ctx, dk, dv):
             d_emb.stride(0),
             ctx.cp_rank,
             ctx.cp_size,
+            REMOVE_INTERLEAVING=ctx.remove_interleaving,
         )
         if ctx.cu_seqlens_kv is None:
             d_kv = d_kv.view(max_seqlen, batch_size, nheads, ctx.k_dim + ctx.v_dim)
             d_emb = d_emb.view(max_seqlen, batch_size, 1, ctx.emb_dim)
-        return d_kv, d_emb, None, None, None, None, None, None, None, None, None
+        return d_kv, d_emb, None, None, None, None, None, None, None, None, None, None
 
 
-def fused_apply_mla_rope_for_kv(
+def fused_mla_rope_kv_split(
     kv: torch.Tensor,
     k_pos_emb: torch.Tensor,
     cos: torch.Tensor,
@@ -744,9 +826,10 @@ def fused_apply_mla_rope_for_kv(
     cp_rank: int = 0,
     cp_size: int = 1,
     rotary_interleaved: bool = False,
+    remove_interleaving: bool = False,
 ):
     """
-    Fused function for applying YARN RoPE to MLA's key and value.
+    Fused function for applying RoPE to MLA's key and value.
     It splits the input tensor kv into key and value,
     and concatenates the processed RoPE to the key.
 
@@ -761,13 +844,14 @@ def fused_apply_mla_rope_for_kv(
         cos/sin: [max_seq_len, 1, 1, emb_dim]
         cu_seqlens_kv: [seq_num + 1] accumulated sequence lengths for thd format
         rotary_interleaved: whether to apply RoPE interleaved, only supports False for now
+        remove_interleaving: if True, output RoPE dims in non-interleaved layout
 
     Returns:
         key: [seq_len, batch_size, head_num, emb_dim + k_dim]
             or [total_seq_len, head_num, emb_dim + k_dim]
         value: [seq_len, batch_size, head_num, v_dim] or [total_seq_len, head_num, v_dim]
     """
-    return ApplyMLARotaryEmbKV.apply(
+    return _FusedMLARoPEKVSplit.apply(
         kv,
         k_pos_emb,
         cos,
@@ -779,4 +863,12 @@ def fused_apply_mla_rope_for_kv(
         cp_rank,
         cp_size,
         rotary_interleaved,
+        remove_interleaving,
     )
+
+
+# ---------------------------------------------------------------------------
+# Backward-compatible aliases (deprecated, prefer the new names above)
+# ---------------------------------------------------------------------------
+fused_apply_mla_rope_for_q = fused_mla_rope_inplace
+fused_apply_mla_rope_for_kv = fused_mla_rope_kv_split
diff --git a/megatron/core/fusions/linear_cross_entropy/__init__.py b/megatron/core/fusions/linear_cross_entropy/__init__.py
new file mode 100644
index 00000000000..b9a9591fa69
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py b/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py
new file mode 100644
index 00000000000..b9a9591fa69
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
new file mode 100644
index 00000000000..3178e8c6909
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/bwd_partial_dlogits.py
@@ -0,0 +1,667 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+import logging
+from typing import Optional, Tuple, Type
+
+try:
+    import cuda.bindings.driver as cuda  # type: ignore
+    import cutlass
+    import cutlass.cute as cute
+    import cutlass.pipeline as pipeline  # type: ignore
+    import cutlass.utils as utils  # type: ignore
+    import cutlass.utils.blackwell_helpers as sm100_utils  # type: ignore
+    from cutlass.cute.nvgpu import cpasync, tcgen05
+
+    SM100_TMEM_CAPACITY_COLUMNS: int = 512
+
+    def make_thread_cooperative_group(size: int, alignment: Optional[int] = None):
+        """
+        Create a thread cooperative group.
+        """
+        return pipeline.CooperativeGroup(
+            pipeline.Agent.Thread, size, alignment=alignment if alignment is not None else size
+        )
+
+    class BwdPartialDlogits:
+        """
+        This class implements the backward kernel for partial d_logits.
+        """
+
+        def __init__(
+            self,
+            reduction: int,
+            acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+            use_2cta_instrs: bool = False,
+            mma_tiler_mn: Tuple[int, int] = (128, 256),
+            vocab_per_split: int = 512,
+        ):
+            self.REDUCTION: cutlass.Constexpr[cutlass.Int32] = cutlass.const_expr(reduction)
+            self.acc_dtype = acc_dtype
+            self.use_2cta_instrs = use_2cta_instrs
+            self.mma_tiler = (*mma_tiler_mn, 1)
+            self.vocab_per_split = vocab_per_split
+
+            self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
+            self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
+
+            self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
+
+            self.threads_per_warp: int = 32
+
+            self.epi_warp_ids = (0, 1, 2, 3)
+            self.load_warp_ids = 4
+            self.mma_warp_ids = 5
+            self.empty_warp_ids = (6, 7)
+
+            self.threads_per_cta: int = self.threads_per_warp * len(
+                (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids)
+            )
+            self.cta_sync_barrier = pipeline.NamedBarrier(
+                barrier_id=1, num_threads=self.threads_per_cta
+            )
+
+            self.buffer_align_bytes: int = 1024
+            self.num_regs_other: int = 32
+            self.num_regs_epi: int = 192
+
+        def _compute_grid(
+            self,
+            problem_mnk: Tuple[int, int, int],
+            cluster_shape_mn: Tuple[int, int],
+            cta_tiler: Tuple[int, int, int],
+        ) -> Tuple[int, int, int]:
+            cluster_shape_mnk = (*cluster_shape_mn, 1)
+
+            grid = cute.round_up(
+                (
+                    cute.ceil_div(problem_mnk[0], cta_tiler[0]),
+                    cute.ceil_div(self.vocab_per_split, cta_tiler[1]),
+                    1,
+                ),
+                cluster_shape_mnk,
+            )
+            return grid
+
+        def _compute_stages(
+            self,
+            tiled_mma: cute.TiledMma,
+            mma_tiler: Tuple[int, int, int],
+            a_dtype: Type[cutlass.Numeric],
+            b_dtype: Type[cutlass.Numeric],
+        ):
+            num_acc_stage = 1
+            num_ab_stage = 4
+            num_epi_stage_per_tile = 4
+            return num_acc_stage, num_ab_stage, num_epi_stage_per_tile
+
+        def _setup_attributes(
+            self,
+            tiled_mma: cute.TiledMma,
+            a_dtype: Type[cutlass.Numeric],
+            b_dtype: Type[cutlass.Numeric],
+        ):
+            self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
+            self.cluster_layout_vmnk = cute.tiled_divide(
+                cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,)
+            )
+
+            mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
+            # it requires k-mode to be 128B aligned
+            mma_inst_tile_k: int = 4
+            self.mma_tiler = (
+                self.mma_tiler[0],
+                self.mma_tiler[1],
+                mma_inst_shape_k * mma_inst_tile_k,
+            )
+
+            self.num_acc_stage, self.num_ab_stage, self.num_epi_stage_per_tile = (
+                self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
+            )
+            self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
+            assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
+
+            self.cta_tile_shape_mnk = (
+                self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
+                self.mma_tiler[1],
+                self.mma_tiler[2],
+            )
+
+        @cute.kernel
+        def kernel(
+            self,
+            split_idx: cutlass.Int32,
+            tiled_mma: cute.TiledMma,
+            tma_atom_a: cute.CopyAtom,
+            mA: cute.Tensor,
+            tma_atom_b: cute.CopyAtom,
+            mB: cute.Tensor,
+            mLabels: cute.Tensor,
+            mDlogprobs: cute.Tensor,
+            mMaximum: cute.Tensor,
+            mAccu: cute.Tensor,
+            mDlogits_partial: cute.Tensor,
+            scalarNumValidTokens: cute.Pointer,
+            ignore_index: cutlass.Int64,
+            a_smem_layout_staged: cute.ComposedLayout,
+            b_smem_layout_staged: cute.ComposedLayout,
+            cluster_layout_vmnk: cute.Layout,
+            problem_mnk: Tuple[int, int, int],
+            rank: cutlass.Int32,
+        ) -> None:
+            """
+            The backward kernel for partial d_logits.
+            """
+            warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+            tidx, _, _ = cute.arch.thread_idx()
+            bidx, bidy, _ = cute.arch.block_idx()
+            # FIXME: block swizzling applied here
+            pidm, pidn = bidx, bidy
+
+            # FIXME: if 2 CTAs, modify here
+            cta_rank_in_cluster = 0
+            block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
+
+            # prefetch tma descriptors
+            if warp_idx == self.load_warp_ids:
+                cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a)
+                cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b)
+
+            smem = utils.SmemAllocator()
+            storage = smem.allocate(self.shared_storage)
+
+            ab_pipeline = pipeline.PipelineTmaUmma.create(
+                num_stages=self.num_ab_stage,
+                producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
+                consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+                tx_count=self.tma_copy_ab_bytes,
+                barrier_storage=storage.load_ab_mbar_ptr.data_ptr(),
+            )
+            ab_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_ab_stage
+            )
+            ab_consumer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.num_ab_stage
+            )
+
+            mma_pipeline = pipeline.PipelineUmmaAsync.create(
+                num_stages=self.num_acc_stage,
+                producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+                consumer_group=make_thread_cooperative_group(
+                    self.threads_per_warp * len(self.epi_warp_ids)
+                ),
+                barrier_storage=storage.mma_mbar_ptr.data_ptr(),
+            )
+            mma_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_acc_stage
+            )
+            mma_consumer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.num_acc_stage
+            )
+
+            tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
+            if warp_idx == self.empty_warp_ids[0]:
+                with cute.arch.elect_one():
+                    cute.arch.mbarrier_init(
+                        tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids)
+                    )
+                    cute.arch.mbarrier_init_fence()
+
+            # -------- tensor partition ------------ #
+            # swizzle o [(tileM, tileK), loopM, loopK, stage]
+            sA = storage.sA.get_tensor(
+                a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner
+            )
+            # swizzle o [(tileN, tileK), loopN, loopK, stage]
+            sB = storage.sB.get_tensor(
+                b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner
+            )
+
+            # FIXME: if 2 CTAs, modify here
+            thr_mma = tiled_mma.get_slice(0)
+            # [MMA, loopM, loopK, stage]
+            tCsA = thr_mma.make_fragment_A(sA)
+            # [MMA, loopN, loopK, stage]
+            tCsB = thr_mma.make_fragment_B(sB)
+
+            # [tileM, tileK, loopK]
+            gA = cute.local_tile(
+                mA, (self.cta_tile_shape_mnk[0], self.cta_tile_shape_mnk[2]), (pidm, None)
+            )
+            # [vocab_per_split, dim]
+            mB_n = cute.local_tile(
+                mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (split_idx, 0)
+            )
+            # [tileN, tileK, loopK]
+            gB = cute.local_tile(
+                mB_n, (self.cta_tile_shape_mnk[1], self.cta_tile_shape_mnk[2]), (pidn, None)
+            )
+
+            a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape)
+            # just to make sure SMEM and GMEM tensor has the same size in the first rank
+            tCgA = thr_mma.partition_A(gA)
+            tCgB = thr_mma.partition_B(gB)
+            # [CPY, stage] & [CPY, loopK]
+            tTMAsA, tTMAgA = cpasync.tma_partition(
+                tma_atom_a,
+                block_in_cluster_coord_vmnk[2],  # cta_coord,
+                a_cta_layout,
+                cute.group_modes(sA, 0, 3),
+                cute.group_modes(tCgA, 0, 3),
+            )
+            b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape)
+            # [CPY, stage] & [CPY, loopK]
+            tTMAsB, tTMAgB = cpasync.tma_partition(
+                tma_atom_b,
+                block_in_cluster_coord_vmnk[1],  # cta_coord
+                b_cta_layout,
+                cute.group_modes(sB, 0, 3),
+                cute.group_modes(tCgB, 0, 3),
+            )
+
+            # ------ Allocate TMEM ------ #
+            tmem_holding_buf = storage.tmem_holding_buf
+            if warp_idx == self.empty_warp_ids[0]:
+                cute.arch.alloc_tmem(
+                    self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs
+                )
+            self.cta_sync_barrier.arrive_and_wait()
+            tmem_ptr = cute.arch.retrieve_tmem_ptr(
+                self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
+            )
+
+            tmem_shape = (128, self.tmem_alloc_cols)
+            acc_shape = thr_mma.partition_shape_C(tmem_shape)
+            tCtC_fake = thr_mma.make_fragment_C(acc_shape)
+            # [(tileM, tileN), loopM, loopN]
+            tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
+
+            # ------ Empty ------ #
+            if warp_idx in self.empty_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+            # ------ Load ------ #
+            if warp_idx == self.load_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+                for k in cutlass.range(cute.size(gA, mode=[2])):
+                    ab_pipeline.producer_acquire(ab_producer_state)
+                    cute.copy(
+                        tma_atom_a,
+                        tTMAgA[(None, k)],
+                        tTMAsA[(None, ab_producer_state.index)],
+                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
+                    )
+                    cute.copy(
+                        tma_atom_b,
+                        tTMAgB[(None, k)],
+                        tTMAsB[(None, ab_producer_state.index)],
+                        tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
+                    )
+                    ab_pipeline.producer_commit(ab_producer_state)
+                    ab_producer_state.advance()
+
+            # ------ MMA ------ #
+            if warp_idx == self.mma_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+                tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+                mma_pipeline.producer_acquire(mma_producer_state)
+
+                for k in cutlass.range(cute.size(gA, mode=[2])):
+                    ab_pipeline.consumer_wait(ab_consumer_state)
+
+                    for kblock_idx in cutlass.range(cute.size(tCsA, mode=[2]), unroll_full=True):
+                        cute.gemm(
+                            tiled_mma,
+                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                            tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
+                            tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
+                            cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                        )
+                        tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+                    ab_pipeline.consumer_release(ab_consumer_state)
+                    ab_consumer_state.advance()
+
+                mma_pipeline.producer_commit(mma_producer_state)
+                mma_producer_state.advance()
+
+            # ------ EPI ------ #
+            if warp_idx in self.epi_warp_ids:
+                cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
+
+                copy_atom_t2r = sm100_utils.get_tmem_load_op(
+                    self.cta_tile_shape_mnk,
+                    utils.LayoutEnum.ROW_MAJOR,
+                    self.acc_dtype,
+                    self.acc_dtype,
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                    self.use_2cta_instrs,
+                )
+                # [tileM, subTileN, loopM, CntSubTileN, loopN]
+                tAcc_epi = cute.flat_divide(
+                    tCtC[((None, None), 0, None)],
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                )
+                tiled_copy_t2r = tcgen05.make_tmem_copy(
+                    copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)]
+                )
+                thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+                tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
+                tTMEM_load_tAcc = cute.group_modes(
+                    tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1
+                )
+
+                # predicates
+                cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
+                tCcAcc = thr_mma.partition_C(cAcc)
+                tCcAcc_epi = cute.flat_divide(
+                    tCcAcc[((None, None), 0, None)],
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                )
+                tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
+                tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2])
+                tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype)
+
+                copy_atom_g2r_int64 = cute.make_copy_atom(
+                    cute.nvgpu.CopyUniversalOp(), mLabels.element_type
+                )
+                copy_atom_g2r_fp32 = cute.make_copy_atom(
+                    cute.nvgpu.CopyUniversalOp(), mDlogprobs.element_type
+                )
+                epilogue_thread_layout = cute.make_layout((128, 1), stride=(1, 1))
+                tiled_copy_g2r_int64 = cute.make_tiled_copy_tv(
+                    copy_atom_g2r_int64, epilogue_thread_layout, cute.make_layout((1, 1))
+                )
+                tiled_copy_g2r_fp32 = cute.make_tiled_copy_tv(
+                    copy_atom_g2r_fp32, epilogue_thread_layout, cute.make_layout((1, 1))
+                )
+                thr_copy_g2r_int64 = tiled_copy_g2r_int64.get_slice(tidx)
+                thr_copy_g2r_fp32 = tiled_copy_g2r_fp32.get_slice(tidx)
+
+                # [tileM]
+                gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,))
+                gMaximum = cute.local_tile(mMaximum, (self.epi_tile[0],), (pidm,))
+                gAccu = cute.local_tile(mAccu, (self.epi_tile[0],), (pidm,))
+
+                # slice along M direction
+                tMCAcc = thr_copy_g2r_int64.partition_S(cAcc)[(None, None, 0)]
+                # [(1, 1), 1]
+                tMCAcc_mask = cute.make_fragment(tMCAcc.shape, cutlass.Boolean)
+                # to align shape with gMax and gAccu
+                tMCAcc_mask = cute.append_ones(tMCAcc_mask)
+                tMCAcc_mask[0] = cute.elem_less(
+                    pidm * self.epi_tile[0] + tidx, cute.size(mA, mode=[0])
+                )
+                # [(1, 1), 1, 1]
+                tMgLabels = thr_copy_g2r_int64.partition_S(cute.append_ones(gLabels))
+                tMrLabels = cute.make_fragment(tMgLabels.shape, tMgLabels.element_type)
+                cute.copy(tiled_copy_g2r_int64, tMgLabels, tMrLabels, pred=tMCAcc_mask)
+                tMgMaximum = thr_copy_g2r_fp32.partition_S(cute.append_ones(gMaximum))
+                tMrMaximum = cute.make_fragment(tMgMaximum.layout, tMgMaximum.element_type)
+                cute.copy(tiled_copy_g2r_fp32, tMgMaximum, tMrMaximum, pred=tMCAcc_mask)
+                tMgAccu = thr_copy_g2r_fp32.partition_S(cute.append_ones(gAccu))
+                tMrAccu = cute.make_fragment(tMgAccu.layout, tMgAccu.element_type)
+                cute.copy(tiled_copy_g2r_fp32, tMgAccu, tMrAccu, pred=tMCAcc_mask)
+
+                tMrDlogprobs = cute.make_fragment(tMgAccu.layout, mDlogprobs.element_type)
+                if cutlass.const_expr(self.REDUCTION == 2):
+                    # mean reduction
+                    num_valid_tokens = cute.make_tensor(scalarNumValidTokens, layout=(1,))
+                    tMrDlogprobs[0] = mDlogprobs[0] / num_valid_tokens[0].to(cutlass.Float32)
+                elif cutlass.const_expr(self.REDUCTION == 1):
+                    # sum reduction
+                    tMrDlogprobs[0] = mDlogprobs[0]
+                else:
+                    # no reduction
+                    gDlogprobs = cute.local_tile(mDlogprobs, (self.epi_tile[0],), (pidm,))
+                    tMgDlogprobs = thr_copy_g2r_fp32.partition_S(cute.append_ones(gDlogprobs))
+                    cute.copy(tiled_copy_g2r_fp32, tMgDlogprobs, tMrDlogprobs, pred=tMCAcc_mask)
+
+                tMrAccu[0] = cute.arch.rcp_approx(tMrAccu[0])
+                tMrDlogprobs[0] *= tMrLabels[0] != ignore_index
+                tMr_d_acc_exp_logits = tMrDlogprobs[0] * tMrAccu[0]
+
+                # ------ Partial output ------ #
+                # [tileM, tileN]
+                gDlogits_partial = cute.local_tile(
+                    mDlogits_partial, (self.epi_tile[0], self.epi_tile[1]), (pidm, pidn)
+                )
+                # blackwell supports STG.256
+                copy_atom_r2g = cute.make_copy_atom(
+                    cute.nvgpu.CopyUniversalOp(),
+                    gDlogits_partial.element_type,
+                    num_bits_per_copy=256,
+                )
+                tiled_copy_r2g = cute.make_tiled_copy_tv(
+                    copy_atom_r2g, epilogue_thread_layout, copy_atom_r2g.layout_dst_tv
+                )
+                thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
+
+                # [CPY, loopM, loopN]
+                tR2GCAcc = thr_copy_r2g.partition_S(cAcc)
+                tR2GCAcc_pred = cute.make_fragment(tR2GCAcc.shape, cutlass.Boolean)
+                for elem in cutlass.range(cute.size(tR2GCAcc_pred, mode=[0])):
+                    for row in cutlass.range(cute.size(tR2GCAcc_pred, mode=[1])):
+                        for col in cutlass.range(cute.size(tR2GCAcc_pred, mode=[2])):
+                            tR2GCAcc_pred[elem, row, col] = cute.elem_less(
+                                pidm * self.epi_tile[0] + tR2GCAcc[elem, row, col][0],
+                                problem_mnk[0],
+                            ) and cute.elem_less(
+                                split_idx * self.vocab_per_split
+                                + pidn * self.epi_tile[1]
+                                + tR2GCAcc[elem, row, col][1],
+                                problem_mnk[1],
+                            )
+
+                tR2GgDlogits = thr_copy_r2g.partition_D(gDlogits_partial)
+
+                # for type conversion
+                dLogits_half = cute.make_fragment(tTMEM_load_rAcc.shape, tR2GgDlogits.element_type)
+                dLogits_half = cute.tiled_divide(
+                    dLogits_half, (cute.size(tR2GgDlogits, mode=[0]), 1)
+                )
+                dLogits_half = cute.group_modes(dLogits_half, 2, cute.rank(dLogits_half))
+
+                mma_pipeline.consumer_wait(mma_consumer_state)
+
+                block_vocab_left_idx: cutlass.Int64 = (
+                    split_idx * self.vocab_per_split + pidn * self.epi_tile[1]
+                )
+                block_vocab_right_idx: cutlass.Int64 = min(
+                    split_idx * self.vocab_per_split + (pidn + 1) * self.epi_tile[1],
+                    min((split_idx + 1) * self.vocab_per_split, problem_mnk[1]),
+                )
+                num_n_subtiles: cutlass.Int64 = cute.ceil_div(
+                    (block_vocab_right_idx - block_vocab_left_idx),
+                    cute.size(tTMEM_load_rAcc, mode=[0]),
+                )
+                for n_subtile in cutlass.range(num_n_subtiles):
+                    cute.copy(
+                        tiled_copy_t2r,
+                        tTMEM_load_tAcc[(None, None, None, n_subtile, mma_consumer_state.index)],
+                        tTMEM_load_rAcc,
+                    )
+
+                    for idx in cutlass.range(
+                        cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True
+                    ):
+                        # exp_logits
+                        tTMEM_load_rAcc[idx] = cute.exp(tTMEM_load_rAcc[idx] - tMrMaximum[0])
+
+                        position: cutlass.Int64 = (
+                            rank * problem_mnk[1]
+                            + split_idx * self.vocab_per_split
+                            + pidn * self.epi_tile[1]
+                            + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
+                            + idx
+                        )
+                        mask: cutlass.Boolean = (
+                            position == tMrLabels[0] and tMrLabels[0] != ignore_index
+                        )
+                        # d_logits
+                        tTMEM_load_rAcc[idx] *= tMr_d_acc_exp_logits
+                        tTMEM_load_rAcc[idx] += mask * -tMrDlogprobs[0]
+                        dLogits_half[idx] = tTMEM_load_rAcc[idx].to(dLogits_half.element_type)
+
+                    for idx in cutlass.range(cute.size(dLogits_half, mode=[1]), unroll_full=True):
+                        copy_id = n_subtile * cute.size(dLogits_half, mode=[1]) + idx
+                        cute.copy(
+                            tiled_copy_r2g,
+                            dLogits_half[(None, idx, None)],
+                            tR2GgDlogits[(None, None, copy_id)],
+                            pred=tR2GCAcc_pred[((0, None), None, copy_id)],
+                        )
+
+                mma_pipeline.consumer_release(mma_consumer_state)
+                mma_consumer_state.advance()
+
+            # ------ Deallocate TMEM ------ #
+            self.cta_sync_barrier.arrive_and_wait()
+            if warp_idx == self.empty_warp_ids[0]:
+                cute.arch.relinquish_tmem_alloc_permit()
+                cute.arch.dealloc_tmem(
+                    tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs
+                )
+
+        @cute.jit
+        def __call__(
+            self,
+            split_idx: cutlass.Int32,
+            hidden: cute.Tensor,
+            weight: cute.Tensor,
+            labels: cute.Tensor,
+            dlogprobs: cute.Tensor,
+            maximum: cute.Tensor,
+            accu: cute.Tensor,
+            dlogits_partial: cute.Tensor,
+            scalarNumValidTokens: cute.Pointer,
+            ignore_index: cutlass.Int64,
+            rank: cutlass.Int32,
+            stream: cuda.CUstream,
+        ) -> None:
+            a_dtype: Type[cutlass.Numeric] = hidden.element_type
+            b_dtype: Type[cutlass.Numeric] = weight.element_type
+
+            if cutlass.const_expr(hidden.element_type != weight.element_type):
+                raise RuntimeError(
+                    f"data type don't match: {hidden.element_type} v.s. {weight.element_type}"
+                )
+            if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
+                raise RuntimeError("hidden can only be FP16 or BF16")
+            if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
+                raise RuntimeError("K dimension doesn't match")
+
+            problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1])
+            if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
+                raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
+            if cutlass.const_expr((problem_mnk[2] * b_dtype.width // 8) % 128 != 0):
+                raise RuntimeError(f"N dimension is not 128B aligned: {problem_mnk[1]}")
+
+            grid = self._compute_grid(
+                problem_mnk=problem_mnk,
+                cluster_shape_mn=self.cluster_shape_mn,
+                cta_tiler=self.mma_tiler,
+            )
+
+            a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
+            b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
+
+            tiled_mma = sm100_utils.make_trivial_tiled_mma(
+                a_dtype,
+                a_major_mode,
+                b_major_mode,
+                self.acc_dtype,
+                self.cta_group,
+                self.mma_tiler[:2],
+            )
+            self._setup_attributes(tiled_mma, a_dtype, b_dtype)
+
+            self.epi_tile = self.cta_tile_shape_mnk[:2]
+
+            # Swizzle o [(tileM, tileK), loopM, loopK, stage]
+            a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+                tiled_mma, self.mma_tiler, a_dtype, self.num_ab_stage
+            )
+            # Swizzle o [(tileN, tileK), loopN, loopK, stage]
+            b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+                tiled_mma, self.mma_tiler, b_dtype, self.num_ab_stage
+            )
+            tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
+            tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
+
+            # Swizzle o [(tileM, tileK), loopM, loopK]
+            a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2])
+            tma_atom_a, tma_tensor_a = cute.nvgpu.make_tiled_tma_atom_A(
+                tma_load_op,
+                hidden,
+                a_smem_layout,
+                self.mma_tiler,
+                tiled_mma,
+                self.cluster_layout_vmnk.shape,
+            )
+            # Swizzle o [(tileN, tileK), loopN, loopK]
+            b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2])
+            tma_atom_b, tma_tensor_b = cute.nvgpu.make_tiled_tma_atom_B(
+                tma_load_op,
+                weight,
+                b_smem_layout,
+                self.mma_tiler,
+                tiled_mma,
+                self.cluster_layout_vmnk.shape,
+            )
+            a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
+            b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
+            self.tma_copy_ab_bytes = a_copy_size + b_copy_size
+
+            @cute.struct
+            class SharedStorage:
+                """
+                The shared storage for the backward kernel.
+                """
+
+                load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_ab_stage * 2]
+                mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
+
+                tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1]
+                tmem_holding_buf: cutlass.Int32
+
+                sA: cute.struct.Align[
+                    cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)],
+                    self.buffer_align_bytes,
+                ]
+                sB: cute.struct.Align[
+                    cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
+                    self.buffer_align_bytes,
+                ]
+
+            self.shared_storage = SharedStorage
+
+            self.kernel(
+                split_idx,
+                tiled_mma,
+                tma_atom_a,
+                tma_tensor_a,
+                tma_atom_b,
+                tma_tensor_b,
+                labels,
+                dlogprobs,
+                maximum,
+                accu,
+                dlogits_partial,
+                scalarNumValidTokens,
+                ignore_index,
+                a_smem_layout_staged,
+                b_smem_layout_staged,
+                self.cluster_layout_vmnk,
+                problem_mnk,
+                rank,
+            ).launch(
+                grid=grid,
+                block=[self.threads_per_cta, 1, 1],
+                cluster=self.cluster_shape_mnk,
+                stream=stream,
+            )
+
+except ImportError:
+    logging.warning("Cutlass or CUDA bindings not found. BwdPartialDlogits will not be available.")
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
new file mode 100644
index 00000000000..07e018b51ff
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/entry.py
@@ -0,0 +1,480 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+import logging
+import os
+import typing
+from dataclasses import dataclass, field
+from functools import lru_cache
+
+try:
+    import cuda.bindings.driver as cuda  # type: ignore
+    import cutlass
+    import cutlass.cute as cute
+    import torch
+    import torch.distributed as dist
+    import triton  # type: ignore
+    from cutlass.cute.runtime import from_dlpack
+
+    import megatron.core.fusions.linear_cross_entropy.utils as utils
+    from megatron.core.fusions.linear_cross_entropy.blackwell import (
+        bwd_partial_dlogits as bwd_partial_dlogits,
+    )
+    from megatron.core.fusions.linear_cross_entropy.blackwell import fwd_mainloop as fwd_mainloop
+    from megatron.core.fusions.linear_cross_entropy.blackwell import triton as triton_kernels
+
+    @dataclass
+    class FwdConfig:
+        """
+        The configuration for the forward pass.
+        """
+
+        _dedicated_stream: torch.cuda.Stream = field(default_factory=torch.cuda.Stream)
+        _dedicated_events: typing.List[torch.cuda.Event] = field(default_factory=list)
+        _initialized: bool = field(default=False)
+        _fwd_mainloop_kernels: typing.Dict[str, cute.kernel] = field(default_factory=dict)
+        _vocab_per_split: int = field(
+            default=int(os.environ.get("LCE_FWD_VOCAB_SPLIT_SIZE", 512 * 6))
+        )
+
+    @dataclass
+    class BwdConfig:
+        """
+        The configuration for the backward pass.
+        """
+
+        _bwd_kernel: typing.Dict[str, cute.kernel] = field(default_factory=dict)
+        _vocab_per_split: int = field(
+            default=int(os.environ.get("LCE_BWD_VOCAB_SPLIT_SIZE", 512 * 6))
+        )
+        _backward_method: utils.BackwardMethodEnum = field(
+            default=utils.BackwardMethodEnum.kDlogitsSplitN
+        )
+
+    @lru_cache(maxsize=1)
+    def _get_fwd_config() -> FwdConfig:
+        """
+        Helper function to lazy initialize the forward configuration.
+        """
+        return FwdConfig()
+
+    @lru_cache(maxsize=1)
+    def _get_bwd_config() -> BwdConfig:
+        """
+        Helper function to lazy initialize the backward configuration.
+        """
+        return BwdConfig()
+
+    def forward(
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        tp_group: typing.Optional[torch.distributed.ProcessGroup] = None,
+        reduction: typing.Literal["none", "sum", "mean"] = "mean",
+        ignore_index: int = -100,
+        sequence_parallel: bool = False,
+    ) -> typing.Tuple[
+        torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, int, int, torch.Tensor
+    ]:
+        """
+        forward host function
+        """
+        tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
+        tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
+        in_tp_mode = (tp_group is not None) and (tp_world_size > 1)
+
+        assert hidden.is_cuda and weight.is_cuda and labels.is_cuda
+        assert weight.device == hidden.device and labels.device == hidden.device
+
+        # hidden could be [batch, seqlen, dim] or [seqlen, batch, dim] or [tokens, dim]
+        assert hidden.dim() == 2 or hidden.dim() == 3
+        # weight must be [vocab_size, dim]
+        assert weight.dim() == 2
+        # labels could be [batch, seqlen] or [seqlen, batch] or [tokens]
+        assert (hidden.dim() == 2 and labels.dim() == 1) or (
+            hidden.dim() == 3 and labels.dim() == 2
+        )
+        assert hidden.is_contiguous() and weight.is_contiguous() and labels.is_contiguous()
+
+        hidden_view = hidden.view(-1, hidden.shape[-1])
+        labels_view = labels.view(-1)
+
+        assert (
+            sequence_parallel and hidden_view.shape[0] * tp_world_size == labels_view.shape[0]
+        ) or (not sequence_parallel and hidden_view.shape[0] == labels_view.shape[0])
+        assert hidden_view.shape[1] == weight.shape[1]
+
+        global_hidden = hidden
+        if in_tp_mode and sequence_parallel:
+            partial_hidden_shape = hidden.shape
+            global_hidden_shape = (
+                partial_hidden_shape[0] * tp_world_size,
+                *partial_hidden_shape[1:],
+            )
+            global_hidden = torch.empty(
+                global_hidden_shape, dtype=hidden.dtype, device=hidden.device
+            )
+            dist.all_gather_into_tensor(global_hidden, hidden, group=tp_group)
+            assert global_hidden.is_contiguous()
+            hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
+
+        num_tokens, dim = hidden_view.shape
+        vocab_size, _ = weight.shape
+
+        if not _get_fwd_config()._initialized:
+            _get_fwd_config()._dedicated_stream = torch.cuda.Stream(hidden.device)
+            _get_fwd_config()._dedicated_events = [torch.cuda.Event() for _ in range(2)]
+            _get_fwd_config()._initialized = True
+
+        REDUCTION = utils.str_to_reduction_enum(reduction)
+        # declare logprobs
+        if REDUCTION == utils.EntropyReductionEnum.kNone:
+            logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+            if in_tp_mode:
+                logprobs.zero_()
+        else:
+            logprobs = torch.zeros((), device=hidden.device, dtype=torch.float32)
+        # declare auxiliary tensors
+        maximum = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+        accumulate = torch.empty_like(maximum, dtype=torch.float32)
+        num_valid_tokens = torch.empty((), device=hidden.device, dtype=torch.int64)
+        assert (
+            maximum.is_contiguous()
+            and accumulate.is_contiguous()
+            and num_valid_tokens.is_contiguous()
+        )
+        # declare intermediate tensors
+        # NOTE: this is a parameter for tuning
+        num_splits = (
+            vocab_size + _get_fwd_config()._vocab_per_split - 1
+        ) // _get_fwd_config()._vocab_per_split
+        _max = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
+        _accu = torch.empty((num_tokens, num_splits), device=hidden.device, dtype=torch.float32)
+        if REDUCTION == utils.EntropyReductionEnum.kNone:
+            _logprobs = logprobs
+        else:
+            _logprobs = torch.empty((num_tokens,), device=hidden.device, dtype=torch.float32)
+            if in_tp_mode:
+                _logprobs.zero_()
+        assert _max.is_contiguous() and _accu.is_contiguous() and _logprobs.is_contiguous()
+
+        triton_kernels.get_num_valid_tokens[(1,)](
+            num_tokens, ignore_index, labels_view, labels_view.stride(0), num_valid_tokens
+        )
+
+        # need to compile the kernel for the first time
+        hidden_packed = from_dlpack(
+            hidden_view.detach(), assumed_align=16
+        ).mark_compact_shape_dynamic(mode=0)
+        weight_packed = from_dlpack(weight.detach(), assumed_align=16)
+        labels_packed = from_dlpack(
+            labels_view.detach(), assumed_align=8
+        ).mark_compact_shape_dynamic(mode=0)
+        logprobs_packed = from_dlpack(_logprobs, assumed_align=16).mark_compact_shape_dynamic(
+            mode=0
+        )
+        _max_packed = from_dlpack(_max, assumed_align=8).mark_compact_shape_dynamic(
+            mode=0, stride_order=(0, 1)
+        )
+        _accu_packed = from_dlpack(_accu, assumed_align=8).mark_compact_shape_dynamic(
+            mode=0, stride_order=(0, 1)
+        )
+        cuda_stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+        # VocabSize and Dim are fixed for a given model,
+        # only the number of tokens can vary
+        key = f"vocab_size:{vocab_size}+dim:{dim}+dtype:{hidden_view.dtype}"
+        if _get_fwd_config()._fwd_mainloop_kernels.get(key) is None:
+            fwd_mainloop_kernel = fwd_mainloop.FwdMainLoop(
+                vocab_per_split=_get_fwd_config()._vocab_per_split
+            )
+            fwd_mainloop_compiled_kernel = cute.compile(
+                fwd_mainloop_kernel,
+                hidden_packed,
+                weight_packed,
+                labels_packed,
+                logprobs_packed,
+                _max_packed,
+                _accu_packed,
+                ignore_index,
+                tp_rank,
+                cuda_stream,
+            )
+            _get_fwd_config()._fwd_mainloop_kernels[key] = fwd_mainloop_compiled_kernel
+        else:
+            fwd_mainloop_compiled_kernel = _get_fwd_config()._fwd_mainloop_kernels[key]
+        fwd_mainloop_compiled_kernel(
+            hidden_packed,
+            weight_packed,
+            labels_packed,
+            logprobs_packed,
+            _max_packed,
+            _accu_packed,
+            ignore_index,
+            tp_rank,
+            cuda_stream,
+        )
+
+        if not in_tp_mode:
+
+            def grid(meta):
+                return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
+
+            triton_kernels.forward_dp_epilogue[grid](
+                num_tokens,
+                num_splits,
+                ignore_index,
+                labels_view,
+                labels_view.stride(0),
+                num_valid_tokens,
+                _max,
+                _max.stride(0),
+                _max.stride(1),
+                _accu,
+                _accu.stride(0),
+                _accu.stride(1),
+                maximum,
+                maximum.stride(0),
+                accumulate,
+                maximum.stride(0),
+                _logprobs,
+                _logprobs.stride(0),
+                logprobs,
+                triton.language.constexpr(REDUCTION.value),
+            )
+        else:
+            _max_backup = _max.clone()
+            dist.all_reduce(_max, op=dist.ReduceOp.MAX, group=tp_group)
+
+            torch.cuda.current_stream().record_event(_get_fwd_config()._dedicated_events[0])
+            with torch.cuda.stream(_get_fwd_config()._dedicated_stream):
+                _get_fwd_config()._dedicated_stream.wait_event(
+                    _get_fwd_config()._dedicated_events[0]
+                )
+                dist.all_reduce(_logprobs, op=dist.ReduceOp.SUM, group=tp_group)
+                _get_fwd_config()._dedicated_stream.record_event(
+                    _get_fwd_config()._dedicated_events[1]
+                )
+
+            def grid(meta):
+                return (triton.cdiv(num_tokens, meta["BLOCK_SIZE_M"]),)
+
+            triton_kernels.forward_tp_epilogue[grid](
+                num_tokens,
+                num_splits,
+                _max,
+                _max.stride(0),
+                _max.stride(1),
+                _max_backup,
+                _max_backup.stride(0),
+                _max_backup.stride(1),
+                _accu,
+                _accu.stride(0),
+                _accu.stride(1),
+                maximum,
+                maximum.stride(0),
+                accumulate,
+                maximum.stride(0),
+            )
+            # reduce accumulate
+            dist.all_reduce(accumulate, op=dist.ReduceOp.SUM, group=tp_group)
+
+            # update logprobs
+            torch.cuda.current_stream().wait_event(_get_fwd_config()._dedicated_events[1])
+            triton_kernels.forward_tp_epilogue_update_logprobs[grid](
+                num_tokens,
+                ignore_index,
+                num_valid_tokens,
+                labels_view,
+                labels_view.stride(0),
+                _logprobs,
+                _logprobs.stride(0),
+                maximum,
+                maximum.stride(0),
+                accumulate,
+                accumulate.stride(0),
+                logprobs,
+                REDUCTION.value,
+            )
+
+        return (
+            logprobs,
+            maximum,
+            accumulate,
+            num_valid_tokens,
+            tp_rank,
+            tp_world_size,
+            global_hidden,
+        )
+
+    def backward(
+        dlogprobs: torch.Tensor,
+        global_hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        maximum: torch.Tensor,
+        accu: torch.Tensor,
+        num_valid_tokens: torch.Tensor,
+        reduction: typing.Literal["none", "sum", "mean"] = "mean",
+        ignore_index: int = -100,
+        tp_group: typing.Optional[dist.ProcessGroup] = None,
+        tp_rank: int = 0,
+        tp_world_size: int = 1,
+        sequence_parallel: bool = False,
+    ) -> typing.Tuple[torch.Tensor, torch.Tensor]:
+        """
+        backward host function
+        """
+        in_tp_mode = (tp_group is not None) and (tp_world_size > 1)
+
+        hidden_view = global_hidden.view(-1, global_hidden.shape[-1])
+        labels_view = labels.view(-1)
+
+        num_tokens, dim = hidden_view.shape
+        vocab_size, _ = weight.shape
+
+        REDUCTION = utils.str_to_reduction_enum(reduction)
+        dlogprobs_view = dlogprobs.view(-1)
+        assert (
+            REDUCTION == utils.EntropyReductionEnum.kNone and dlogprobs.shape == (num_tokens,)
+        ) or (REDUCTION != utils.EntropyReductionEnum.kNone and dlogprobs.dim() == 0)
+        assert dlogprobs.is_contiguous() and dlogprobs.is_cuda
+
+        assert (
+            num_valid_tokens.dim() == 0
+            and num_valid_tokens.is_cuda
+            and num_valid_tokens.dtype == torch.int64
+        )
+
+        # Allocate d_hidden in float32 for better numerical stability
+        d_hidden = torch.empty_like(global_hidden, dtype=torch.float32)
+        d_weight = torch.empty_like(weight)
+        assert d_hidden.is_contiguous() and d_weight.is_contiguous()
+
+        # FIXME: implement different backward methods
+        _backward_method = _get_bwd_config()._backward_method
+        if _backward_method == utils.BackwardMethodEnum.kDlogitsSplitN:
+            vocab_per_split = _get_bwd_config()._vocab_per_split
+            num_splits = (vocab_size + vocab_per_split - 1) // vocab_per_split
+
+            _d_logits = torch.empty(
+                (num_tokens, vocab_per_split),
+                device=global_hidden.device,
+                dtype=global_hidden.dtype,
+            )
+
+            hidden_packed = from_dlpack(
+                hidden_view.detach(), assumed_align=16
+            ).mark_compact_shape_dynamic(mode=0)
+            weight_packed = from_dlpack(weight.detach(), assumed_align=16)
+            labels_packed = from_dlpack(
+                labels_view.detach(), assumed_align=8
+            ).mark_compact_shape_dynamic(mode=0)
+            dlogprobs_packed = from_dlpack(
+                dlogprobs_view.detach(), assumed_align=8
+            ).mark_compact_shape_dynamic(mode=0)
+            maximum_packed = from_dlpack(
+                maximum.detach(), assumed_align=8
+            ).mark_compact_shape_dynamic(mode=0)
+            accu_packed = from_dlpack(accu.detach(), assumed_align=8).mark_compact_shape_dynamic(
+                mode=0
+            )
+            dlogits_packed = from_dlpack(_d_logits, assumed_align=32).mark_compact_shape_dynamic(
+                mode=0
+            )
+            scalarNumValidTokens_packed = cute.runtime.make_ptr(
+                cutlass.Int64, num_valid_tokens.data_ptr(), cute.AddressSpace.gmem, assumed_align=8
+            )
+
+            stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)
+
+            key = (
+                f"vocab_size:{vocab_size}+dim:{dim}+reduction:{REDUCTION}+dtype:{hidden_view.dtype}"
+            )
+            if _get_bwd_config()._bwd_kernel.get(key) is None:
+                bwd_kernel = bwd_partial_dlogits.BwdPartialDlogits(
+                    reduction=REDUCTION.value, vocab_per_split=vocab_per_split
+                )
+                bwd_kernel_compiled = cute.compile(
+                    bwd_kernel,
+                    0,  # split_idx
+                    hidden_packed,
+                    weight_packed,
+                    labels_packed,
+                    dlogprobs_packed,
+                    maximum_packed,
+                    accu_packed,
+                    dlogits_packed,
+                    scalarNumValidTokens_packed,
+                    ignore_index,
+                    tp_rank,
+                    stream,
+                )
+                _get_bwd_config()._bwd_kernel[key] = bwd_kernel_compiled
+            else:
+                bwd_kernel_compiled = _get_bwd_config()._bwd_kernel.get(key)
+
+            for split_idx in range(num_splits):
+                bwd_kernel_compiled(
+                    split_idx,
+                    hidden_packed,
+                    weight_packed,
+                    labels_packed,
+                    dlogprobs_packed,
+                    maximum_packed,
+                    accu_packed,
+                    dlogits_packed,
+                    scalarNumValidTokens_packed,
+                    ignore_index,
+                    tp_rank,
+                    stream,
+                )
+                # remove padding areas
+                # cublas can handle non-contiguous tensors
+                # therefore, we do not need to contiguous the tensor
+                vocab_right_bound = (
+                    min((split_idx + 1) * vocab_per_split, vocab_size) - split_idx * vocab_per_split
+                )
+                valid_d_logits = _d_logits[:, :vocab_right_bound]
+
+                _delta_hidden = torch.mm(
+                    valid_d_logits,
+                    weight[split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :],
+                    out_dtype=torch.float32,
+                ).view_as(d_hidden)
+                if split_idx == 0:
+                    d_hidden.copy_(_delta_hidden)
+                else:
+                    d_hidden.add_(_delta_hidden)
+                torch.matmul(
+                    valid_d_logits.T,
+                    hidden_view,
+                    out=d_weight[
+                        split_idx * vocab_per_split : (split_idx + 1) * vocab_per_split, :
+                    ],
+                )
+        else:
+            raise NotImplementedError(f"Unsupported backward method: {_backward_method}")
+
+        if in_tp_mode:
+            dist.all_reduce(d_hidden, op=dist.ReduceOp.SUM, group=tp_group)
+            if sequence_parallel:
+                partial_hidden_shape = (
+                    global_hidden.shape[0] // tp_world_size,
+                    *global_hidden.shape[1:],
+                )
+                partial_num_tokens = num_tokens // tp_world_size
+                d_hidden = d_hidden.view(-1, d_hidden.shape[-1])[
+                    tp_rank * partial_num_tokens : (tp_rank + 1) * partial_num_tokens, :
+                ]
+                d_hidden = d_hidden.view(partial_hidden_shape).clone()
+
+        # convert d_hidden to the original dtype
+        d_hidden = d_hidden.type_as(global_hidden)
+
+        return d_hidden, d_weight
+
+except ImportError:
+    logging.warning(
+        "Cutlass or CUDA bindings not found. LinearCrossEntropy Blackwell entry "
+        "points will not be available."
+    )
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
new file mode 100644
index 00000000000..93f5b9523e7
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/fwd_mainloop.py
@@ -0,0 +1,693 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+"""
+Implementations of the fusion lm_head(Linear) + Cross-Entropy kernel
+"""
+
+import logging
+from typing import Tuple, Type
+
+try:
+    import cuda.bindings.driver as cuda  # type: ignore
+    import cutlass
+    import cutlass.cute as cute
+    import cutlass.pipeline as pipeline  # type: ignore
+    import cutlass.utils as utils  # type: ignore
+    import cutlass.utils.blackwell_helpers as sm100_utils  # type: ignore
+    from cutlass.cute.nvgpu import cpasync, tcgen05
+
+    SM100_TMEM_CAPACITY_COLUMNS: int = 512
+
+    def make_thread_cooperative_group(size: int):
+        """
+        Create a thread cooperative group.
+        """
+        return pipeline.CooperativeGroup(pipeline.Agent.Thread, size, alignment=size)
+
+    class FwdMainLoop:
+        """
+        This class implements the mainloop for forward process.
+
+        Traits stored as attributes.
+
+        :param acc_dtype:
+        """
+
+        def __init__(
+            self,
+            acc_dtype: Type[cutlass.Numeric] = cutlass.Float32,
+            use_2cta_instrs: bool = False,
+            mma_tiler_mn: Tuple[int, int] = (128, 256),
+            vocab_per_split: int = 512,
+        ):
+            """
+            Configuration including:
+                - MMA instruction settings
+                - Cluster Shape
+            """
+            self.acc_dtype: Type[cutlass.Numeric] = acc_dtype
+            self.use_2cta_instrs = use_2cta_instrs
+            # This is the shape covered by tiledMMA, not just single MMA instruction
+            self.mma_tiler = (*mma_tiler_mn, 1)
+            self.cta_tiler = (self.mma_tiler[0], vocab_per_split, self.mma_tiler[2])
+            self.vocab_per_split = vocab_per_split
+
+            self.cta_group = tcgen05.CtaGroup.TWO if self.use_2cta_instrs else tcgen05.CtaGroup.ONE
+            self.cluster_shape_mn = (2, 1) if self.use_2cta_instrs else (1, 1)
+
+            self.occupancy = 1
+            # query SMEM capacity
+            self.smem_capacity = utils.get_smem_capacity_in_bytes("sm_100")
+
+            # the maximum columns per MMA is 256, and there is only one GEMM, so we can fully
+            # assign TMEM for that GEMM of different tiles.
+            # so 512 = 2 * 256
+
+            self.threads_per_warp: int = 32
+            # 1 warp for loading, 1 warp for issuing MMA, 1 WG for storing
+            self.epi_warp_ids = (0, 1, 2, 3)
+            self.load_warp_ids = 4
+            self.mma_warp_ids = 5
+            self.empty_warp_ids = (6, 7)
+
+            self.threads_per_cta: int = self.threads_per_warp * len(
+                (*self.epi_warp_ids, self.load_warp_ids, self.mma_warp_ids, *self.empty_warp_ids)
+            )
+
+            self.cta_sync_barrier = pipeline.NamedBarrier(
+                barrier_id=1, num_threads=self.threads_per_cta
+            )
+            self.tmem_alloc_barrier = pipeline.NamedBarrier(
+                barrier_id=2, num_threads=self.threads_per_cta
+            )
+
+            self.buffer_align_bytes: int = 1024
+            self.num_regs_other: int = 32
+            self.num_regs_epi: int = 192
+
+        def _compute_stages(
+            self,
+            tiled_mma: cute.TiledMma,
+            mma_tiler: Tuple[int, int, int],
+            a_dtype: Type[cutlass.Numeric],
+            b_dtype: Type[cutlass.Numeric],
+        ):
+            a_smem_layout_stage_one = sm100_utils.make_smem_layout_a(
+                tiled_mma, mma_tiler, a_dtype, 1  # only single stage
+            )
+            b_smem_layout_stage_one = sm100_utils.make_smem_layout_b(
+                tiled_mma, mma_tiler, b_dtype, 1
+            )
+            a_bytes_per_stage = cute.size_in_bytes(a_dtype, a_smem_layout_stage_one)
+            b_bytes_per_stage = cute.size_in_bytes(b_dtype, b_smem_layout_stage_one)
+            num_acc_stage = 2
+            num_a_stage = 4
+            num_b_stage = 4
+            num_epi_stage_per_tile = 4
+
+            return num_acc_stage, num_a_stage, num_b_stage, num_epi_stage_per_tile
+
+        def _setup_attributes(
+            self,
+            tiled_mma: cute.TiledMma,
+            a_dtype: Type[cutlass.Numeric],
+            b_dtype: Type[cutlass.Numeric],
+        ):
+            self.cluster_shape_mnk = (*self.cluster_shape_mn, 1)
+            self.cluster_layout_vmnk = cute.tiled_divide(
+                cute.make_layout(self.cluster_shape_mnk), (tiled_mma.thr_id.shape,)
+            )
+
+            # this is fixed for dense MMA, k=16
+            mma_inst_shape_k = cute.size(tiled_mma.shape_mnk, mode=[2])
+            # 16*4 = 64; 64 * sizeof(FP16) = 128Bytes
+            mma_inst_tile_k: int = 4
+            self.mma_tiler = (
+                self.mma_tiler[0],
+                self.mma_tiler[1],
+                mma_inst_shape_k * mma_inst_tile_k,
+            )
+
+            self.num_acc_stage, self.num_a_stage, self.num_b_stage, self.num_epi_stage_per_tile = (
+                self._compute_stages(tiled_mma, self.mma_tiler, a_dtype, b_dtype)
+            )
+            self.tmem_alloc_cols = self.num_acc_stage * self.mma_tiler[1]
+            assert self.tmem_alloc_cols <= SM100_TMEM_CAPACITY_COLUMNS
+
+            self.cta_tile_shape_mnk = (
+                self.mma_tiler[0] // cute.size(tiled_mma.thr_id.shape),
+                self.mma_tiler[1],
+                self.mma_tiler[2],
+            )
+
+        @cute.kernel
+        def kernel(
+            self,
+            tiled_mma: cute.TiledMma,
+            tma_atom_a: cute.CopyAtom,
+            mA: cute.Tensor,
+            tma_atom_b: cute.CopyAtom,
+            mB: cute.Tensor,
+            mLabels: cute.Tensor,
+            mMax: cute.Tensor,
+            mAccu: cute.Tensor,
+            mLogprobs: cute.Tensor,
+            a_smem_layout_staged: cute.ComposedLayout,
+            b_smem_layout_staged: cute.ComposedLayout,
+            cluster_layout_vmnk: cute.Layout,
+            problem_mnk: Tuple[int, int, int],
+            ignore_index: cutlass.Int64,
+            rank: cutlass.Int32,
+        ):
+            """
+            The forward kernel for the mainloop.
+            """
+            warp_idx = cute.arch.make_warp_uniform(cute.arch.warp_idx())
+            tidx, _, _ = cute.arch.thread_idx()
+            bidx, bidy, _ = cute.arch.block_idx()
+            # FIXME: block swizzling applied here
+            pidm, pidn = bidx, bidy
+
+            # prefetch tma descriptors
+            if warp_idx == self.load_warp_ids:
+                cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_a)
+                cute.nvgpu.cpasync.prefetch_descriptor(tma_atom_b)
+
+            # declare SMEM
+            smem = utils.SmemAllocator()
+            storage = smem.allocate(self.shared_storage)
+
+            ab_pipeline = pipeline.PipelineTmaUmma.create(
+                num_stages=self.num_a_stage,
+                producer_group=make_thread_cooperative_group(len([self.load_warp_ids])),
+                consumer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+                tx_count=self.tma_copy_a_bytes + self.tma_copy_b_bytes,
+                barrier_storage=storage.load_ab_mbar_ptr.data_ptr(),
+            )
+            ab_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_a_stage
+            )
+            ab_consumer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.num_a_stage
+            )
+
+            mma_pipeline = pipeline.PipelineUmmaAsync.create(
+                num_stages=self.num_acc_stage,
+                producer_group=make_thread_cooperative_group(len([self.mma_warp_ids])),
+                consumer_group=make_thread_cooperative_group(
+                    self.threads_per_warp * len(self.epi_warp_ids)
+                ),
+                barrier_storage=storage.mma_mbar_ptr.data_ptr(),
+            )
+            mma_producer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Producer, self.num_acc_stage
+            )
+            mma_consumer_state = pipeline.make_pipeline_state(
+                pipeline.PipelineUserType.Consumer, self.num_acc_stage
+            )
+
+            tmem_dealloc_mbar_ptr = storage.tmem_dealloc_mbar_ptr.data_ptr()
+            if warp_idx == self.empty_warp_ids[0]:
+                with cute.arch.elect_one():
+                    cute.arch.mbarrier_init(
+                        tmem_dealloc_mbar_ptr, self.threads_per_warp * len(self.epi_warp_ids)
+                    )
+                    cute.arch.mbarrier_init_fence()
+
+            # -------- SMEM partition ------------ #
+            # swizzle o [(tileM, tileK), loopM, loopK, Stage]
+            sA = storage.sA.get_tensor(
+                a_smem_layout_staged.outer, swizzle=a_smem_layout_staged.inner
+            )
+            # swizzle o [(tileN, tileK), loopN, loopK, stage]
+            sB = storage.sB.get_tensor(
+                b_smem_layout_staged.outer, swizzle=b_smem_layout_staged.inner
+            )
+
+            # FIXME: if 2 CTAs, modify here
+            thr_mma = tiled_mma.get_slice(0)
+            # [MMA, loopM, loopK, stage]
+            tCsA = thr_mma.make_fragment_A(sA)
+            # [MMA, loopN, loopK, stage]
+            tCsB = thr_mma.make_fragment_B(sB)
+
+            # ---------- GMEM partition ----------- #
+            # [tileM, tileK, loopK]
+            gA = cute.local_tile(mA, (self.mma_tiler[0], self.mma_tiler[2]), (pidm, None))
+
+            # [vocab_size_per_split, dim]
+            mB_n = cute.local_tile(
+                mB, (self.vocab_per_split, cute.size(mB.layout.shape, mode=[1])), (pidn, 0)
+            )
+
+            # [tileN, tileK, loopN, loopK]
+            gB = cute.local_tile(mB_n, (self.mma_tiler[1], self.mma_tiler[2]), (None, None))
+
+            # [MMA, tileCntM, tileCntK, loopK]
+            tCgA = thr_mma.partition_A(gA)
+            # [MMA, tileCntN, tileCntK, loopN, loopK]
+            tCgB = thr_mma.partition_B(gB)
+
+            a_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, 0, None, 0)).shape)
+            # FIXME: if 2 CTAs, modify here
+            cta_rank_in_cluster = 0
+            block_in_cluster_coord_vmnk = cluster_layout_vmnk.get_flat_coord(cta_rank_in_cluster)
+            tTMAsA, tTMAgA = cpasync.tma_partition(
+                tma_atom_a,
+                block_in_cluster_coord_vmnk[2],  # cta_coord,
+                a_cta_layout,
+                cute.group_modes(sA, 0, 3),  # SMEM tensor
+                cute.group_modes(tCgA, 0, 3),  # GMEM tensor
+            )
+            b_cta_layout = cute.make_layout(cute.slice_(cluster_layout_vmnk, (0, None, 0, 0)).shape)
+            tTMAsB, tTMAgB = cpasync.tma_partition(
+                tma_atom_b,
+                block_in_cluster_coord_vmnk[1],  # cta_coord
+                b_cta_layout,
+                cute.group_modes(sB, 0, 3),
+                cute.group_modes(tCgB, 0, 3),
+            )
+
+            # Allocate TMEM
+            tmem_holding_buf = storage.tmem_holding_buf
+            if warp_idx == self.empty_warp_ids[0]:
+                cute.arch.alloc_tmem(
+                    self.tmem_alloc_cols, tmem_holding_buf, is_two_cta=self.use_2cta_instrs
+                )
+            self.cta_sync_barrier.arrive_and_wait()
+            tmem_ptr = cute.arch.retrieve_tmem_ptr(
+                self.acc_dtype, alignment=16, ptr_to_buffer_holding_addr=tmem_holding_buf
+            )
+
+            # [(tileM, tileN), loopM, loopN]
+            tmem_shape = (128, self.tmem_alloc_cols)
+            acc_shape = thr_mma.partition_shape_C(tmem_shape)
+            tCtC_fake = thr_mma.make_fragment_C(acc_shape)
+            tCtC = cute.make_tensor(tmem_ptr, tCtC_fake.layout)
+
+            block_vocab_left_idx: cutlass.Int64 = pidn * self.vocab_per_split
+            block_vocab_right_idx: cutlass.Int64 = min(
+                (pidn + 1) * self.vocab_per_split, problem_mnk[1]
+            )
+            num_n_tiles: cutlass.Int64 = cute.ceil_div(
+                (block_vocab_right_idx - block_vocab_left_idx), self.mma_tiler[1]
+            )
+
+            # ///////
+            # empty
+            # ///////
+            if warp_idx in self.empty_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+            # ///////
+            # load
+            # ///////
+            if warp_idx == self.load_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+                for n in cutlass.range(num_n_tiles):
+                    for k in cutlass.range(cute.size(gA, mode=[2])):
+                        ab_pipeline.producer_acquire(ab_producer_state)
+                        cute.copy(
+                            tma_atom_a,
+                            tTMAgA[(None, k)],
+                            tTMAsA[(None, ab_producer_state.index)],
+                            tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
+                        )
+                        cute.copy(
+                            tma_atom_b,
+                            tTMAgB[(None, n, k)],
+                            tTMAsB[(None, ab_producer_state.index)],
+                            tma_bar_ptr=ab_pipeline.producer_get_barrier(ab_producer_state),
+                        )
+                        ab_pipeline.producer_commit(ab_producer_state)
+                        ab_producer_state.advance()
+
+            # ///////
+            # mma
+            # ///////
+            if warp_idx == self.mma_warp_ids:
+                cute.arch.warpgroup_reg_dealloc(self.num_regs_other)
+
+                for n in cutlass.range(num_n_tiles):
+                    # disable accumulate for the first tile
+                    tiled_mma.set(tcgen05.Field.ACCUMULATE, False)
+                    mma_pipeline.producer_acquire(mma_producer_state)
+
+                    for k in cutlass.range(cute.size(gA, mode=[2])):
+                        ab_pipeline.consumer_wait(ab_consumer_state)
+
+                        for kblock_idx in cutlass.range(
+                            cute.size(tCsA, mode=[2]), unroll_full=True
+                        ):
+                            cute.gemm(
+                                tiled_mma,
+                                cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                                tCsA[(None, None, kblock_idx, ab_consumer_state.index)],
+                                tCsB[(None, None, kblock_idx, ab_consumer_state.index)],
+                                cute.append_ones(tCtC[(None, None, mma_producer_state.index)]),
+                            )
+                            # enable accumulate for the next tile
+                            tiled_mma.set(tcgen05.Field.ACCUMULATE, True)
+
+                        ab_pipeline.consumer_release(ab_consumer_state)
+                        ab_consumer_state.advance()
+
+                    mma_pipeline.producer_commit(mma_producer_state)
+                    mma_producer_state.advance()
+
+            # //////////
+            # epilogue
+            # //////////
+            if warp_idx in self.epi_warp_ids:
+                cute.arch.warpgroup_reg_alloc(self.num_regs_epi)
+
+                # epilog TMEM copy and partition
+                copy_atom_t2r = sm100_utils.get_tmem_load_op(
+                    self.cta_tile_shape_mnk,
+                    utils.LayoutEnum.ROW_MAJOR,  # This is hard-coded
+                    self.acc_dtype,
+                    self.acc_dtype,
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                    self.use_2cta_instrs,
+                )
+                # [tileM, subTileN, loopM, CntSubTileN, loopN]
+                tAcc_epi = cute.flat_divide(
+                    tCtC[((None, None), 0, None)],
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                )
+                tiled_copy_t2r = tcgen05.make_tmem_copy(
+                    copy_atom_t2r, tAcc_epi[(None, None, 0, 0, 0)]
+                )
+                thr_copy_t2r = tiled_copy_t2r.get_slice(tidx)
+                tTMEM_load_tAcc = thr_copy_t2r.partition_S(tAcc_epi)
+                # [(pattern), loopM, loopN, CntTileM, CntTileN]
+                tTMEM_load_tAcc = cute.group_modes(
+                    tTMEM_load_tAcc, 3, cute.rank(tTMEM_load_tAcc) - 1
+                )
+
+                cAcc = cute.make_identity_tensor(self.mma_tiler[:2])
+                tCcAcc = thr_mma.partition_C(cAcc)
+                # [tileM, subTileN, loopM, CntSubTileN, CntTileN]
+                tCcAcc_epi = cute.flat_divide(
+                    tCcAcc[((None, None), 0, None)],
+                    (self.epi_tile[0], self.epi_tile[1] // self.num_epi_stage_per_tile),
+                )
+                tTMEM_load_cAcc = thr_copy_t2r.partition_D(tCcAcc_epi)
+                tTMEM_load_cAcc_shape = cute.select(tTMEM_load_cAcc.shape, mode=[0, 1, 2])
+
+                # epilogue layouts
+                epilogue_thread_layout = cute.make_layout((128, 1))
+                copy_atom_g2r = cute.make_copy_atom(
+                    cute.nvgpu.CopyUniversalOp(), mLabels.element_type
+                )
+                tiled_copy_g2r = cute.make_tiled_copy(
+                    copy_atom_g2r, epilogue_thread_layout, (128, 1)
+                )
+                thr_copy_g2r = tiled_copy_g2r.get_slice(tidx)
+
+                copy_atom_r2g = cute.make_copy_atom(cute.nvgpu.CopyUniversalOp(), cutlass.Float32)
+                tiled_copy_r2g = cute.make_tiled_copy(
+                    copy_atom_r2g, epilogue_thread_layout, (128, 1)
+                )
+                thr_copy_r2g = tiled_copy_r2g.get_slice(tidx)
+
+                # auxiliary tensors
+                # [tileM]
+                gLabels = cute.local_tile(mLabels, (self.epi_tile[0],), (pidm,))
+
+                tLabelsCAcc = thr_copy_g2r.partition_S(cAcc)[(None, None, 0)]
+                tLabelsCAcc_mask = cute.make_fragment(tLabelsCAcc.shape, cutlass.Boolean)
+                # [(1, 1), 1]
+                tLabelsCAcc_mask[0] = cute.elem_less(pidm * self.epi_tile[0] + tidx, problem_mnk[0])
+                # to align shape with gMax and gAccu
+                tLabelsCAcc_mask = cute.append_ones(tLabelsCAcc_mask)
+
+                # [(1, 1), 1, 1]
+                tLabelsgLabels = thr_copy_g2r.partition_S(cute.append_ones(gLabels))
+                tLabelsrLabels = cute.make_fragment(
+                    tLabelsgLabels.shape, tLabelsgLabels.element_type
+                )
+                cute.copy(tiled_copy_g2r, tLabelsgLabels, tLabelsrLabels, pred=tLabelsCAcc_mask)
+                valid_mask: cutlass.Boolean = (
+                    tLabelsrLabels[0] != ignore_index
+                ) and tLabelsCAcc_mask[0]
+
+                # [tileM, 1]
+                gMax = cute.local_tile(mMax, (self.epi_tile[0], 1), (pidm, pidn))
+                # [(CPYM, CPYN), loopM, loopN]
+                tR2GgMax = thr_copy_r2g.partition_D(gMax)
+                tR2GrMax = cute.make_fragment(tR2GgMax.shape, tR2GgMax.element_type)
+                tR2GrMax.fill(-1e30)
+
+                # [tileM, 1]
+                gAccu = cute.local_tile(mAccu, (self.epi_tile[0], 1), (pidm, pidn))
+                # [(CPYM, CPYN), loopM, loopN]
+                tR2GgAccu = thr_copy_r2g.partition_D(gAccu)
+                tR2GrAccu = cute.make_fragment(tR2GgAccu.shape, tR2GgAccu.element_type)
+                tR2GrAccu.fill(0.0)
+
+                # [tileM, 1]
+                gLogprobs = cute.append_ones(
+                    cute.local_tile(mLogprobs, (self.epi_tile[0],), (pidm,))
+                )
+                # [(CPYM, CPYN), loopM, loopN]
+                tR2GgLogprobs = thr_copy_r2g.partition_D(gLogprobs)
+                tR2GrLogprobs = cute.make_fragment(tR2GgLogprobs.shape, tR2GgLogprobs.element_type)
+                tR2GrLogprobs.fill(0.0)
+
+                # [(tileN // num_epi_stage_per_tile, 1), 1, 1]
+                tTMEM_load_rAcc = cute.make_fragment(tTMEM_load_cAcc_shape, self.acc_dtype)
+
+                for n in cutlass.range(num_n_tiles):
+                    mma_pipeline.consumer_wait(mma_consumer_state)
+
+                    left: cutlass.Int64 = block_vocab_left_idx + n * self.epi_tile[1]
+                    right: cutlass.Int64 = min(
+                        (n + 1) * self.epi_tile[1] + block_vocab_left_idx, block_vocab_right_idx
+                    )
+                    num_n_subtiles: cutlass.Int64 = cute.ceil_div(
+                        (right - left), cute.size(tTMEM_load_rAcc, mode=[0])
+                    )
+                    for n_subtile in cutlass.range(num_n_subtiles):
+                        cute.copy(
+                            tiled_copy_t2r,
+                            tTMEM_load_tAcc[
+                                (None, None, None, n_subtile, mma_consumer_state.index)
+                            ],
+                            tTMEM_load_rAcc,
+                        )
+
+                        for idx in cutlass.range(
+                            cute.size(tTMEM_load_rAcc, mode=[0]), unroll_full=True
+                        ):
+                            local_position: cutlass.Int64 = (
+                                n * self.epi_tile[1]
+                                + n_subtile * cute.size(tTMEM_load_rAcc, mode=[0])
+                                + idx
+                            )
+                            if (block_vocab_left_idx + local_position) < block_vocab_right_idx:
+                                _max_old = tR2GrMax[0]
+                                tR2GrMax[0] = cute.arch.fmax(tR2GrMax[0], tTMEM_load_rAcc[idx])
+                                exp_logits = cute.exp(tTMEM_load_rAcc[idx] - tR2GrMax[0])
+                                coeff = cute.exp(_max_old - tR2GrMax[0])
+                                tR2GrAccu[0] = coeff * tR2GrAccu[0] + exp_logits
+
+                                position: cutlass.Int64 = (
+                                    rank * problem_mnk[1]
+                                    + pidn * self.vocab_per_split
+                                    + local_position
+                                )
+                                mask: cutlass.Boolean = valid_mask and (
+                                    position == tLabelsrLabels[0]
+                                )
+                                tR2GrLogprobs[0] += mask * tTMEM_load_rAcc[idx]
+
+                    mma_pipeline.consumer_release(mma_consumer_state)
+                    mma_consumer_state.advance()
+
+                cute.copy(tiled_copy_r2g, tR2GrMax, tR2GgMax, pred=tLabelsCAcc_mask)
+                cute.copy(tiled_copy_r2g, tR2GrAccu, tR2GgAccu, pred=tLabelsCAcc_mask)
+
+                vocab_left_idx: cutlass.Int64 = rank * problem_mnk[1] + pidn * self.vocab_per_split
+                vocab_right_idx: cutlass.Int64 = rank * problem_mnk[1] + min(
+                    (pidn + 1) * self.vocab_per_split, problem_mnk[1]
+                )
+                valid: cutlass.Boolean = (
+                    tLabelsrLabels[0] >= vocab_left_idx and tLabelsrLabels[0] < vocab_right_idx
+                )
+                tLabelsCAcc_mask[0] &= valid
+
+                cute.copy(tiled_copy_r2g, tR2GrLogprobs, tR2GgLogprobs, pred=tLabelsCAcc_mask)
+
+            # Dealloc TMEM
+            self.cta_sync_barrier.arrive_and_wait()
+            if warp_idx == self.empty_warp_ids[0]:
+                cute.arch.relinquish_tmem_alloc_permit()
+                cute.arch.dealloc_tmem(
+                    tmem_ptr, self.tmem_alloc_cols, is_two_cta=self.use_2cta_instrs
+                )
+
+        @staticmethod
+        def _compute_grid(
+            problem_mnk: Tuple[int, int, int],
+            cluster_shape_mn: Tuple[int, int],
+            cta_tiler: Tuple[int, int, int],
+            num_splits: int,
+        ) -> Tuple[int, int, int]:
+
+            cluster_shape = (*cluster_shape_mn, 1)
+
+            grid = cute.round_up(
+                (cute.ceil_div(problem_mnk[0], cta_tiler[0]), num_splits, 1), cluster_shape
+            )
+            return grid
+
+        @cute.jit
+        def __call__(
+            self,
+            hidden: cute.Tensor,
+            weight: cute.Tensor,
+            labels: cute.Tensor,
+            _logprobs: cute.Tensor,
+            _max: cute.Tensor,
+            _accu: cute.Tensor,
+            ignore_index: cutlass.Int64,
+            rank: cutlass.Int32,
+            stream: cuda.CUstream,
+        ) -> None:
+            a_dtype: Type[cutlass.Numeric] = hidden.element_type
+            b_dtype: Type[cutlass.Numeric] = weight.element_type
+
+            if cutlass.const_expr(hidden.element_type != weight.element_type):
+                raise RuntimeError(
+                    f"data type don't match: {hidden.element_type} v.s. {weight.element_type}"
+                )
+            if cutlass.const_expr(hidden.element_type not in [cutlass.Float16, cutlass.BFloat16]):
+                raise RuntimeError("hidden can only be FP16 or BF16")
+            if cutlass.const_expr(hidden.layout.shape[1] != weight.layout.shape[1]):
+                raise RuntimeError("K dimension doesn't match")
+
+            problem_mnk = (hidden.layout.shape[0], weight.layout.shape[0], hidden.layout.shape[1])
+            if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 16 != 0):
+                raise RuntimeError(f"K dimension is not 16B aligned: {problem_mnk[2]}")
+
+            num_splits = cute.ceil_div(problem_mnk[1], self.vocab_per_split)
+
+            grid = self._compute_grid(
+                problem_mnk=problem_mnk,
+                cluster_shape_mn=self.cluster_shape_mn,
+                cta_tiler=self.cta_tiler,
+                num_splits=num_splits,
+            )
+            a_major_mode = utils.LayoutEnum.from_tensor(hidden).mma_major_mode()
+            b_major_mode = utils.LayoutEnum.from_tensor(weight).mma_major_mode()
+
+            tiled_mma = sm100_utils.make_trivial_tiled_mma(
+                a_dtype,
+                a_major_mode,
+                b_major_mode,
+                self.acc_dtype,
+                self.cta_group,
+                self.mma_tiler[:2],
+            )
+
+            self._setup_attributes(tiled_mma, a_dtype, b_dtype)
+            if cutlass.const_expr((problem_mnk[2] * a_dtype.width // 8) % 128 != 0):
+                raise RuntimeError(f"K dimension is not 128B aligned: {problem_mnk[2]}")
+
+            self.epi_tile = self.mma_tiler[:2]
+
+            # Swizzle o [(tileM, tileK), loopM, loopK, stage]
+            a_smem_layout_staged = sm100_utils.make_smem_layout_a(
+                tiled_mma, self.mma_tiler, a_dtype, self.num_a_stage
+            )
+            # Swizzle o [(tileN, tileK), loopN, loopK, stage]
+            b_smem_layout_staged = sm100_utils.make_smem_layout_b(
+                tiled_mma, self.mma_tiler, b_dtype, self.num_b_stage
+            )
+
+            # TMA loading
+            tma_load_op = cpasync.CopyBulkTensorTileG2SOp(self.cta_group)
+            tma_store_op = cpasync.CopyBulkTensorTileS2GOp()
+
+            # Swizzle o [(tileM, tileK), loopM, loopK]
+            a_smem_layout = cute.select(a_smem_layout_staged, mode=[0, 1, 2])
+            # create tma copy atom for hidden,
+            # and the cooresponding tma descriptor tensor
+            tma_atom_a, tma_desc_a = cute.nvgpu.make_tiled_tma_atom_A(
+                tma_load_op,
+                hidden,  # gmem_tensor
+                a_smem_layout,  # SMEM layout
+                self.mma_tiler,  # MMA tiler
+                tiled_mma,  # TiledMMA
+                self.cluster_layout_vmnk.shape,  # cluster_shape_vmnk
+            )
+            # Swizzle o [(tileN, tileK), loopN, loopK]
+            b_smem_layout = cute.select(b_smem_layout_staged, mode=[0, 1, 2])
+            tma_atom_b, tma_desc_b = cute.nvgpu.make_tiled_tma_atom_B(
+                tma_load_op,
+                weight,  # gmem_tensor
+                b_smem_layout,  # SMEM layout
+                self.mma_tiler,  # MMA tiler
+                tiled_mma,  # TiledMMA
+                self.cluster_layout_vmnk.shape,  # cluster_shape_vmnk
+            )
+            a_copy_size = cute.size_in_bytes(a_dtype, a_smem_layout)
+            b_copy_size = cute.size_in_bytes(b_dtype, b_smem_layout)
+            self.tma_copy_a_bytes = a_copy_size
+            self.tma_copy_b_bytes = b_copy_size
+
+            assert self.num_a_stage == self.num_b_stage
+
+            @cute.struct
+            class SharedStorage:
+                """
+                The shared storage for the forward kernel.
+                """
+
+                # pipeline barriers, 2 = producer + consumer
+                load_ab_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_a_stage * 2]
+                mma_mbar_ptr: cute.struct.MemRange[cutlass.Int64, self.num_acc_stage * 2]
+                tmem_dealloc_mbar_ptr: cute.struct.MemRange[cutlass.Int64, 1]
+                # tmem holding buffer
+                tmem_holding_buf: cutlass.Int32
+                # SMEM tensors
+                sA: cute.struct.Align[
+                    cute.struct.MemRange[a_dtype, cute.cosize(a_smem_layout_staged)],
+                    self.buffer_align_bytes,
+                ]
+                sB: cute.struct.Align[
+                    cute.struct.MemRange[b_dtype, cute.cosize(b_smem_layout_staged)],
+                    self.buffer_align_bytes,
+                ]
+
+            self.shared_storage = SharedStorage
+
+            # launch kernel
+            self.kernel(
+                tiled_mma,
+                tma_atom_a,
+                tma_desc_a,
+                tma_atom_b,
+                tma_desc_b,
+                labels,
+                _max,
+                _accu,
+                _logprobs,
+                a_smem_layout_staged,
+                b_smem_layout_staged,
+                self.cluster_layout_vmnk,
+                problem_mnk,
+                ignore_index,
+                rank,
+            ).launch(
+                grid=grid,
+                block=[self.threads_per_cta, 1, 1],
+                cluster=self.cluster_shape_mnk,
+                stream=stream,
+            )
+            return None
+
+except ImportError:
+    logging.warning("Cutlass or CUDA Python bindings not found. FwdMainLoop will not be available.")
diff --git a/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
new file mode 100644
index 00000000000..e025cc046f4
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/blackwell/triton.py
@@ -0,0 +1,248 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+import triton  # type: ignore
+import triton.language as tl  # type: ignore
+
+# NOTE: tl.pointer_type() is not available in Triton 3.3.0
+
+
+@triton.autotune(
+    configs=[
+        triton.Config({"BLOCK_SIZE_M": 1024}, num_stages=3, num_warps=32),
+        triton.Config({"BLOCK_SIZE_M": 2048}, num_stages=3, num_warps=32),
+    ],
+    key=["num_tokens"],
+)
+@triton.jit
+def get_num_valid_tokens(
+    num_tokens: tl.int64,
+    ignore_index: tl.int64,
+    labels_ptr,  #: tl.pointer_type(tl.int64),
+    stride_labels: tl.int64,
+    num_valid_tokens_ptr,  #: tl.pointer_type(tl.int64),
+    BLOCK_SIZE_M: tl.constexpr,
+):
+    """
+    Calculate the number of valid tokens in the labels tensor.
+    """
+    num_pid_m: tl.int64 = tl.cdiv(num_tokens, BLOCK_SIZE_M)
+
+    num_valid_tokens: tl.int64 = tl.zeros((), dtype=tl.int64)
+    for m in range(0, num_pid_m):
+        offs_am = m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+
+        labels = tl.load(
+            labels_ptr + offs_am * stride_labels, mask=offs_am < num_tokens, other=ignore_index
+        )
+
+        valid_labels_mask = labels != ignore_index
+        num_valid_tokens += (tl.sum(valid_labels_mask.to(tl.int32), axis=0)).to(tl.int64)
+    tl.store(num_valid_tokens_ptr, num_valid_tokens)
+
+
+@triton.autotune(
+    configs=[triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64})],
+    key=["num_tokens", "num_splits"],
+)
+@triton.jit
+def forward_dp_epilogue(
+    num_tokens: tl.int64,
+    num_splits: tl.int64,  # TODO: maybe this could be a constexpr
+    ignore_index: tl.int64,
+    labels_ptr,  #: tl.pointer_type(tl.int64),
+    stride_labels: tl.int64,
+    num_valid_tokens_ptr,  #: tl.pointer_type(tl.int64),
+    max_ptr,  #: tl.pointer_type(tl.float32),
+    stride_max_m: tl.int64,
+    stride_max_n: tl.int64,
+    accu_ptr,  #: tl.pointer_type(tl.float32),
+    stride_accu_m: tl.int64,
+    stride_accu_n: tl.int64,
+    global_max_ptr,  #: tl.pointer_type(tl.float32),
+    stride_global_max: tl.int64,
+    global_accu_ptr,  #: tl.pointer_type(tl.float32),
+    stride_global_accu: tl.int64,
+    global_logprobs_ptr,  #: tl.pointer_type(tl.float32),
+    stride_global_logprobs: tl.int64,
+    global_logprobs_scalar_ptr,  #: tl.pointer_type(tl.float32),
+    REDUCTION: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+):
+    """
+    forward epilogue in dp
+    """
+    pid_m = tl.program_id(axis=0)
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    global_max = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
+    global_accu = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
+
+    for pid_n in range(0, tl.cdiv(num_splits, BLOCK_SIZE_N)):
+        offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+
+        _max = tl.load(
+            max_ptr + offs_m[:, None] * stride_max_m + offs_n[None, :] * stride_max_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+        _accu = tl.load(
+            accu_ptr + offs_m[:, None] * stride_accu_m + offs_n[None, :] * stride_accu_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+
+        # local reduction
+        _max_old = global_max
+        _local_max = tl.max(_max, axis=1, return_indices=False)
+        global_max = tl.maximum(global_max, _local_max)
+
+        _scale = tl.exp(_max - global_max[:, None])
+        _coeff = tl.exp(_max_old - global_max)
+        global_accu = _coeff * global_accu + tl.sum(_scale * _accu, axis=1)
+
+    # store maximum
+    tl.store(global_max_ptr + offs_m * stride_global_max, global_max, mask=offs_m < num_tokens)
+    # store accumulate
+    tl.store(global_accu_ptr + offs_m * stride_global_accu, global_accu, mask=offs_m < num_tokens)
+    # update logprobs
+    labels = tl.load(
+        labels_ptr + offs_m * stride_labels, mask=offs_m < num_tokens, other=ignore_index
+    )
+    global_logprobs_ptrs = global_logprobs_ptr + offs_m * stride_global_logprobs
+    global_logprobs = tl.load(global_logprobs_ptrs, mask=offs_m < num_tokens)
+    global_logprobs = global_max + tl.log(global_accu) - global_logprobs
+    label_mask = labels != ignore_index
+    global_logprobs = tl.where(label_mask, global_logprobs, 0.0)
+
+    if REDUCTION == 0:  # no-reduction
+        tl.store(global_logprobs_ptrs, global_logprobs, mask=offs_m < num_tokens)
+    elif REDUCTION == 1:  # sum
+        global_logprobs_scalar = tl.sum(global_logprobs, axis=0)
+        tl.atomic_add(global_logprobs_scalar_ptr, global_logprobs_scalar)
+    elif REDUCTION == 2:  # mean
+        num_valid_tokens = tl.load(num_valid_tokens_ptr)
+        global_logprobs_scalar = tl.fdiv(
+            tl.sum(global_logprobs, axis=0), num_valid_tokens.to(tl.float32)
+        )
+        tl.atomic_add(global_logprobs_scalar_ptr, global_logprobs_scalar)
+
+
+@triton.autotune(
+    configs=[triton.Config({"BLOCK_SIZE_M": 16, "BLOCK_SIZE_N": 64})],
+    key=["num_tokens", "num_splits"],
+)
+@triton.jit
+def forward_tp_epilogue(
+    num_tokens: tl.int64,
+    num_splits: tl.int64,
+    reduced_max_ptr,  #: tl.pointer_type(tl.float32),
+    stride_reduced_max_m: tl.int64,
+    stride_reduced_max_n: tl.int64,
+    original_max_ptr,  #: tl.pointer_type(tl.float32),
+    stride_original_max_m: tl.int64,
+    stride_original_max_n: tl.int64,
+    accu_ptr,  #: tl.pointer_type(tl.float32),
+    stride_accu_m: tl.int64,
+    stride_accu_n: tl.int64,
+    global_max_ptr,  #: tl.pointer_type(tl.float32),
+    stride_global_max: tl.int64,
+    global_accu_ptr,  #: tl.pointer_type(tl.float32),
+    stride_global_accu: tl.int64,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+):
+    """
+    forward epilogue in tp
+    """
+    pid_m = tl.program_id(axis=0)
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+
+    global_max = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
+    global_accu = tl.zeros((BLOCK_SIZE_M,), dtype=tl.float32)
+
+    for pid_n in range(0, tl.cdiv(num_splits, BLOCK_SIZE_N)):
+        offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+
+        _reduced_max = tl.load(
+            reduced_max_ptr
+            + offs_m[:, None] * stride_reduced_max_m
+            + offs_n[None, :] * stride_reduced_max_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+        _original_max = tl.load(
+            original_max_ptr
+            + offs_m[:, None] * stride_original_max_m
+            + offs_n[None, :] * stride_original_max_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+        _accu = tl.load(
+            accu_ptr + offs_m[:, None] * stride_accu_m + offs_n[None, :] * stride_accu_n,
+            mask=(offs_m[:, None] < num_tokens) & (offs_n[None, :] < num_splits),
+            other=0.0,
+        )
+
+        # local reduction
+        _max_old = global_max
+        _local_max = tl.max(_reduced_max, axis=1)
+        global_max = tl.maximum(global_max, _local_max)
+
+        # update accumulate
+        _coeff = tl.exp(_max_old - global_max)
+        _scale = tl.exp(_original_max - global_max[:, None])
+        global_accu = _coeff * global_accu + tl.sum(_scale * _accu, axis=1)
+
+    # store
+    tl.store(global_max_ptr + offs_m * stride_global_max, global_max, mask=offs_m < num_tokens)
+    tl.store(global_accu_ptr + offs_m * stride_global_accu, global_accu, mask=offs_m < num_tokens)
+
+
+@triton.autotune(configs=[triton.Config({"BLOCK_SIZE_M": 16})], key=["num_tokens"])
+@triton.jit
+def forward_tp_epilogue_update_logprobs(
+    num_tokens: tl.int64,
+    ignore_index: tl.int64,
+    num_valid_tokens_ptr,  #: tl.pointer_type(tl.int64),
+    labels_ptr,  #: tl.pointer_type(tl.int64),
+    stride_labels: tl.int64,
+    logprobs_ptr,  #: tl.pointer_type(tl.float32),
+    stride_logprobs: tl.int64,
+    maximum_ptr,  #: tl.pointer_type(tl.float32),
+    stride_maximum: tl.int64,
+    accumulate_ptr,  #: tl.pointer_type(tl.float32),
+    stride_accumulate: tl.int64,
+    logprobs_scalar_ptr,  #: tl.pointer_type(tl.float32),
+    REDUCTION: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+):
+    """
+    update logprobs in tp
+    """
+    pid_m = tl.program_id(axis=0)
+
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+
+    logprobs = tl.load(logprobs_ptr + offs_m * stride_logprobs, mask=offs_m < num_tokens)
+    maximum = tl.load(maximum_ptr + offs_m * stride_maximum, mask=offs_m < num_tokens)
+    accumulate = tl.load(accumulate_ptr + offs_m * stride_accumulate, mask=offs_m < num_tokens)
+
+    labels = tl.load(
+        labels_ptr + offs_m * stride_labels, mask=offs_m < num_tokens, other=ignore_index
+    )
+    label_mask = labels != ignore_index
+
+    logprobs = maximum + tl.log(accumulate) - logprobs
+    logprobs = tl.where(label_mask, logprobs, 0.0)
+
+    if REDUCTION == 0:  # no-reduction
+        tl.store(logprobs_ptr + offs_m * stride_logprobs, logprobs, mask=offs_m < num_tokens)
+    elif REDUCTION == 1:  # sum
+        logprobs_scalar = tl.sum(logprobs, axis=0)
+        tl.atomic_add(logprobs_scalar_ptr, logprobs_scalar)
+    elif REDUCTION == 2:  # mean
+        num_valid_tokens = tl.load(num_valid_tokens_ptr)
+        logprobs_scalar = tl.fdiv(tl.sum(logprobs, axis=0), num_valid_tokens.to(tl.float32))
+        tl.atomic_add(logprobs_scalar_ptr, logprobs_scalar)
diff --git a/megatron/core/fusions/linear_cross_entropy/utils.py b/megatron/core/fusions/linear_cross_entropy/utils.py
new file mode 100644
index 00000000000..d077d64ab17
--- /dev/null
+++ b/megatron/core/fusions/linear_cross_entropy/utils.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+import typing
+from enum import Enum
+
+
+class EntropyReductionEnum(Enum):
+    """
+    Enum for the reduction method of cross entropy.
+    """
+
+    kNone = 0
+    kSum = 1
+    kMean = 2
+
+
+def str_to_reduction_enum(reduction: typing.Literal["none", "sum", "mean"]) -> EntropyReductionEnum:
+    """
+    str -> EntropyReductionEnum
+    """
+    _enum = EntropyReductionEnum.kNone
+    if reduction == "none":
+        _enum = EntropyReductionEnum.kNone
+    elif reduction == "sum":
+        _enum = EntropyReductionEnum.kSum
+    elif reduction == "mean":
+        _enum = EntropyReductionEnum.kMean
+    else:
+        raise ValueError(f"Invalid reduction: {reduction}")
+    return _enum
+
+
+class BackwardMethodEnum(Enum):
+    """
+    Enum for the backward method of linear cross entropy.
+    """
+
+    # two separate kernels for d_hidden and d_weight, respectively
+    kTwoKernels = 0
+    # calculate partial d_logits along its N dimension
+    kDlogitsSplitN = 1
+    # fuse d_hidden and d_weight into a single kernel
+    kFused = 2
diff --git a/megatron/core/inference/contexts/dynamic_context.py b/megatron/core/inference/contexts/dynamic_context.py
index 5b264b36302..fe053522c62 100644
--- a/megatron/core/inference/contexts/dynamic_context.py
+++ b/megatron/core/inference/contexts/dynamic_context.py
@@ -1263,6 +1263,7 @@ def apply_rotary_emb_query(
             cu_seqlens=cu_seqlens_q,
             cp_group=cp_group,
             mscale=mscale,
+            mla_rotary_interleaved=config.multi_latent_attention,
         )
         return query
 
@@ -1297,11 +1298,21 @@ def apply_rotary_emb_key(
                     f"paused_request_count={self.paused_request_count}"
                 )
             key = apply_rotary_pos_emb(
-                t=key[:n], freqs=key_emb[:n], config=config, cp_group=cp_group, mscale=mscale
+                t=key[:n],
+                freqs=key_emb[:n],
+                config=config,
+                cp_group=cp_group,
+                mscale=mscale,
+                mla_rotary_interleaved=config.multi_latent_attention,
             )
         else:
             key[:n] = apply_rotary_pos_emb(
-                t=key[:n], freqs=key_emb[:n], config=config, cp_group=cp_group, mscale=mscale
+                t=key[:n],
+                freqs=key_emb[:n],
+                config=config,
+                cp_group=cp_group,
+                mscale=mscale,
+                mla_rotary_interleaved=config.multi_latent_attention,
             )
         return key
 
diff --git a/megatron/core/model_parallel_config.py b/megatron/core/model_parallel_config.py
index dabe0d0aced..d56c6282e0c 100644
--- a/megatron/core/model_parallel_config.py
+++ b/megatron/core/model_parallel_config.py
@@ -6,8 +6,11 @@
 
 import torch
 
+from megatron.core.utils import experimental_api
+
 
 @dataclass
+@experimental_api
 class ModelParallelConfig:
     """Base configuration for Megatron Core
 
@@ -59,14 +62,29 @@ class ModelParallelConfig:
     can handle without overflowing the memory. Typically, a good starting point is to set this
     to maximum sequence length / context parallel size.
     This is used to calculate the number and length of sub-samples assigned to 
-    each rank when using hybrid_context_parallel.
+    each rank when sequence_packing_scheduler is not None.
     """
 
-    hybrid_context_parallel: bool = False
+    dynamic_context_parallel: bool = False
     """
-    If true, enables hybrid context parallel. This is used to balance the workload of 
+    If true, enables dynamic context parallel. This is used to balance the workload of 
     each CP rank when we use packed samples with variable sequence lengths.
-    Please set max_seqlen_per_dp_cp_rank when using hybrid_context_parallel.
+    Dynamic CP forms variable-sized CP groups from the DPxCP ranks dynamically.
+    Please set max_seqlen_per_dp_cp_rank.
+    """
+
+    min_dynamic_context_parallel_size: int = 1
+    """Minimum CP group size for dynamic context parallel. Default 1 (no CP).
+    The maximum is dp_size * context_parallel_size (the full DPxCP group)."""
+
+    hybrid_context_parallel: bool = False
+    """Deprecated. Use ``dynamic_context_parallel`` instead."""
+
+    sequence_packing_scheduler: Optional[Literal['dp_balanced', 'default_dynamic_cp']] = None
+    """
+    Scheduler for sequence packing and dynamic context parallel.
+    dp_balanced: DP-balanced scheduler for sequence packing.
+    default_dynamic_cp: Dynamic-CP scheduler for packed sequence balancing.
     """
 
     expert_model_parallel_size: int = 1
@@ -235,9 +253,14 @@ class ModelParallelConfig:
        Defaults to False.
     """
 
-    cross_entropy_fusion_impl: Literal['native', 'te'] = 'native'
-    """If 'native', MCore based CE loss fusion is used, if 'te', Parallel CE loss
-       from Transformer Engine library is used. Defaults to 'native'.
+    cross_entropy_fusion_impl: Literal['native', 'te', 'linear'] = 'native'
+    """
+    Specifies the implementation of cross-entropy loss fusion.
+
+    Options:
+    - 'native': Uses MCore-based cross-entropy loss fusion (default).
+    - 'te': Uses the parallel cross-entropy loss implementation from the Transformer Engine library.
+    - 'linear': Uses a linear-cross-entropy fusion approach.
     """
 
     tp_comm_overlap_disable_qkv: bool = False
@@ -412,6 +435,34 @@ def __post_init__(self):
         See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more
         details.
         """
+        if self.hybrid_context_parallel:
+            warnings.warn(
+                "hybrid_context_parallel is deprecated and will be removed in a future release. "
+                "Use dynamic_context_parallel instead.",
+                DeprecationWarning,
+            )
+            if self.dynamic_context_parallel:
+                raise ValueError(
+                    "Cannot set both hybrid_context_parallel and dynamic_context_parallel. "
+                    "Please use dynamic_context_parallel only."
+                )
+            self.dynamic_context_parallel = True
+
+        if self.dynamic_context_parallel:
+            if self.sequence_packing_scheduler is None:
+                self.sequence_packing_scheduler = 'default_dynamic_cp'
+            if self.sequence_packing_scheduler != 'default_dynamic_cp':
+                raise ValueError(
+                    'Dynamic context parallelism requires '
+                    'sequence_packing_scheduler=default_dynamic_cp'
+                )
+
+            if self.min_dynamic_context_parallel_size < 1:
+                raise ValueError(
+                    f"min_dynamic_context_parallel_size must be >= 1, "
+                    f"got {self.min_dynamic_context_parallel_size}"
+                )
+
         if self.sequence_parallel:
             if self.tensor_model_parallel_size <= 1:
                 raise ValueError("Cannot use sequence parallelism without tensor parallelism")
diff --git a/megatron/core/models/common/embeddings/rope_utils.py b/megatron/core/models/common/embeddings/rope_utils.py
index 2fd19194813..c97f738771b 100644
--- a/megatron/core/models/common/embeddings/rope_utils.py
+++ b/megatron/core/models/common/embeddings/rope_utils.py
@@ -93,8 +93,11 @@ def _apply_rotary_pos_emb_bshd(
     t: Tensor,
     freqs: Tensor,
     rotary_interleaved: bool = False,
-    multi_latent_attention: bool = False,
+    mla_rotary_interleaved: bool = False,
     mscale: float = 1.0,
+    inverse: bool = False,
+    mla_output_remove_interleaving: bool = False,
+    multi_latent_attention: Optional[bool] = None,
 ) -> Tensor:
     """Apply rotary positional embedding to input tensor T.
 
@@ -103,16 +106,33 @@ def _apply_rotary_pos_emb_bshd(
     Args:
         t (Tensor): Input tensor T is of shape [seq_length, ... , dim]
         freqs (Tensor): Rotary Positional embedding tensor freq is of shape [seq_length, ..., dim]
+        rotary_interleaved (bool): Whether to apply interleaving in the rotate half function.
+        mla_rotary_interleaved (bool): Whether to apply MLA-style interleaving for RoPE.
+        mscale (float): The scaling factor for the RoPE.
 
     Returns:
         Tensor: The input tensor after applying RoPE
     """
+    if multi_latent_attention is not None:
+        warnings.warn(
+            "multi_latent_attention is deprecated. Please use mla_rotary_interleaved instead.",
+            DeprecationWarning,
+        )
+        mla_rotary_interleaved = multi_latent_attention
+
+    # Some callers may pass freqs with an extra singleton axis, e.g.
+    # t: [s, b, d] and freqs: [s, 1, 1, d]. In that case, broadcasting would
+    # accidentally expand to [s, s, b, d]. Squeeze the extra singleton axis to
+    # keep freqs rank aligned with t.
+    if freqs.dim() == t.dim() + 1 and freqs.size(-2) == 1:
+        freqs = freqs.squeeze(-2)
+
     rot_dim = freqs.shape[-1]
 
     # ideally t_pass is empty so rotary pos embedding is applied to all tensor t
     t, t_pass = t[..., :rot_dim], t[..., rot_dim:]
 
-    if multi_latent_attention:
+    if mla_rotary_interleaved:
         x1 = t[..., 0::2]
         x2 = t[..., 1::2]
         t = torch.cat((x1, x2), dim=-1)
@@ -121,8 +141,18 @@ def _apply_rotary_pos_emb_bshd(
     # second part is sine component, need to change signs with _rotate_half method
     cos_ = (torch.cos(freqs) * mscale).to(t.dtype)
     sin_ = (torch.sin(freqs) * mscale).to(t.dtype)
+    if inverse:
+        sin_ = -sin_
 
     t = (t * cos_) + (_rotate_half(t, rotary_interleaved) * sin_)
+
+    # Fallback to original permutation
+    # DSv4 applies rope on V and O, so we need to uninterleave the tensor.
+    # The existing MLA code is safe because the dot product is permutation-invariant.
+    if mla_rotary_interleaved and mla_output_remove_interleaving:
+        x1, x2 = torch.chunk(t, 2, dim=-1)
+        t = torch.stack((x1, x2), dim=-1).flatten(start_dim=-2)
+
     return torch.cat((t, t_pass), dim=-1)
 
 
@@ -180,9 +210,12 @@ def _apply_rotary_pos_emb_thd(
     cu_seqlens: Tensor,
     freqs: Tensor,
     rotary_interleaved: bool = False,
-    multi_latent_attention: bool = False,
+    mla_rotary_interleaved: bool = False,
     mscale: float = 1.0,
+    inverse: bool = False,
+    mla_output_remove_interleaving: bool = False,
     cp_group: torch.distributed.ProcessGroup = None,
+    multi_latent_attention: Optional[bool] = None,
 ) -> Tensor:
     """A baseline implementation of applying RoPE for `thd` format.
 
@@ -196,6 +229,12 @@ def _apply_rotary_pos_emb_thd(
     Returns:
         Tensor: Shape [t, h, d]. The input tensor after applying RoPE.
     """
+    if multi_latent_attention is not None:
+        warnings.warn(
+            "multi_latent_attention is deprecated. Please use mla_rotary_interleaved instead.",
+            DeprecationWarning,
+        )
+        mla_rotary_interleaved = multi_latent_attention
 
     if cp_group is None:
         raise ValueError("cp_group must be provided for THD format RoPE")
@@ -226,8 +265,10 @@ def _apply_rotary_pos_emb_thd(
             t.unsqueeze(1),
             freqs_packed,
             rotary_interleaved=rotary_interleaved,
-            multi_latent_attention=multi_latent_attention,
+            mla_rotary_interleaved=mla_rotary_interleaved,
             mscale=mscale,
+            inverse=inverse,
+            mla_output_remove_interleaving=mla_output_remove_interleaving,
         ).squeeze(1)
     else:
         # CASE 2: Traditional mapping without offsets
@@ -242,8 +283,10 @@ def _apply_rotary_pos_emb_thd(
             t.unsqueeze(1),
             freqs_packed,
             rotary_interleaved=rotary_interleaved,
-            multi_latent_attention=multi_latent_attention,
+            mla_rotary_interleaved=mla_rotary_interleaved,
             mscale=mscale,
+            inverse=inverse,
+            mla_output_remove_interleaving=mla_output_remove_interleaving,
         ).squeeze(1)
 
 
@@ -254,6 +297,9 @@ def apply_rotary_pos_emb(
     cu_seqlens: Optional[Tensor] = None,
     mscale: float = 1.0,
     cp_group: torch.distributed.ProcessGroup = None,
+    mla_rotary_interleaved: bool = False,
+    inverse: bool = False,
+    mla_output_remove_interleaving: bool = False,
 ):
     """
     Reroute to the appropriate apply_rotary_pos_emb function depending on
@@ -282,6 +328,18 @@ def apply_rotary_pos_emb(
                     "Using unfused implementation."
                 )
                 use_unfused = True
+            if mla_rotary_interleaved:
+                warnings.warn(
+                    "apply_rope_fusion does not support MLA-style interleaving in RoPE."
+                    "Using unfused implementation."
+                )
+                use_unfused = True
+            if inverse:
+                warnings.warn(
+                    "inverse RoPE is not supported by TE's fused RoPE. "
+                    "Using unfused implementation."
+                )
+                use_unfused = True
             if not use_unfused:
                 assert fused_apply_rotary_pos_emb is not None, "apply_rope_fusion is not available."
                 return fused_apply_rotary_pos_emb(t, freqs, interleaved=config.rotary_interleaved)
@@ -301,8 +359,10 @@ def apply_rotary_pos_emb(
             t,
             freqs,
             rotary_interleaved=config.rotary_interleaved,
-            multi_latent_attention=config.multi_latent_attention,
+            mla_rotary_interleaved=mla_rotary_interleaved,
             mscale=mscale,
+            inverse=inverse,
+            mla_output_remove_interleaving=mla_output_remove_interleaving,
         )
     else:
         return _apply_rotary_pos_emb_thd(
@@ -310,9 +370,11 @@ def apply_rotary_pos_emb(
             cu_seqlens,
             freqs,
             rotary_interleaved=config.rotary_interleaved,
-            multi_latent_attention=config.multi_latent_attention,
+            mla_rotary_interleaved=mla_rotary_interleaved,
             mscale=mscale,
             cp_group=cp_group,
+            inverse=inverse,
+            mla_output_remove_interleaving=mla_output_remove_interleaving,
         )
 
 
@@ -339,7 +401,7 @@ def apply_rotary_pos_emb_with_cos_sin(
             t,
             freqs,
             rotary_interleaved=rotary_interleaved,
-            multi_latent_attention=False,
+            mla_rotary_interleaved=False,
             mscale=1.0,
         )
     else:
diff --git a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py
index 166ef9b41e7..bc5a9c5fa3f 100644
--- a/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py
+++ b/megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py
@@ -186,13 +186,13 @@ def forward(
             emb = get_pos_emb_on_this_cp_rank(emb, 0, cp_group)
         return emb, _mscale
 
-    def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq=False):
+    def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq=False, cp_group=None):
         self.max_seq_len_cached = seq_len
         self.offset_cached = offset
         self.dtype_cached = dtype
         self.packed_seq_cached = packed_seq
 
-        emb, _mscale = self.forward(seq_len, offset, packed_seq)
+        emb, _mscale = self.forward(seq_len, offset, packed_seq=packed_seq, cp_group=cp_group)
         self.register_buffer(
             "cos_cached", (emb.cos() * _mscale).to(dtype).contiguous(), persistent=False
         )
@@ -201,7 +201,7 @@ def _set_cos_sin_cache(self, seq_len, offset, dtype, packed_seq=False):
         )
 
     def get_cached_cos_sin(
-        self, seq_len, offset=0, dtype=torch.get_default_dtype(), packed_seq=False
+        self, seq_len, offset=0, dtype=torch.get_default_dtype(), packed_seq=False, cp_group=None
     ):
         """Get cached cos and sin values."""
         if (
@@ -210,7 +210,7 @@ def get_cached_cos_sin(
             or dtype != self.dtype_cached
             or packed_seq != self.packed_seq_cached
         ):
-            self._set_cos_sin_cache(seq_len, offset, dtype, packed_seq)
+            self._set_cos_sin_cache(seq_len, offset, dtype, packed_seq, cp_group)
         return (self.cos_cached[:seq_len, ...], self.sin_cached[:seq_len, ...])
 
 
diff --git a/megatron/core/models/common/language_module/language_module.py b/megatron/core/models/common/language_module/language_module.py
index e8bb564e759..10261686eef 100644
--- a/megatron/core/models/common/language_module/language_module.py
+++ b/megatron/core/models/common/language_module/language_module.py
@@ -420,7 +420,7 @@ def tie_embeddings_and_output_weights_state_dict(
         sharded_state_dict: ShardedStateDict,
         output_layer_weight_key: str,
         first_stage_word_emb_key: str,
-        metadata: dict = {},
+        metadata: Optional[dict] = None,
     ) -> None:
         """Ties the embedding and output weights in a given sharded state dict.
 
@@ -430,9 +430,11 @@ def tie_embeddings_and_output_weights_state_dict(
                 This entry will be replaced with a tied version
             first_stage_word_emb_key (str): this must be the same as the
                 ShardedTensor.key of the first stage word embeddings.
+            metadata (Optional[Dict]): metadata controlling sharded state dict creation.
 
         Returns: None, acts in-place
         """
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
         if not self.post_process:
             # No output layer
             assert output_layer_weight_key not in sharded_state_dict, sharded_state_dict.keys()
diff --git a/megatron/core/models/common/model_chunk_schedule_plan.py b/megatron/core/models/common/model_chunk_schedule_plan.py
index 9032d337e00..24802157809 100644
--- a/megatron/core/models/common/model_chunk_schedule_plan.py
+++ b/megatron/core/models/common/model_chunk_schedule_plan.py
@@ -14,6 +14,7 @@
     get_comm_stream,
     get_comp_stream,
 )
+from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.utils import nvtx_range_pop, nvtx_range_push
 
 
@@ -173,6 +174,11 @@ def create_node(stream, module, name):
         else:
             self.mtp_post_process = NoopScheduleNode()
 
+        # mlp and combine may receive dgrad from attn, which is managed by cuda graph.
+        if CudaGraphScope.attn in self.config.cuda_graph_scope:
+            self.mlp.manual_grads_release = False
+            self.moe_combine.manual_grads_release = False
+
     def get_fp8_context(self):
         """
         Get the fp8 context for the transformer layer.
@@ -241,11 +247,14 @@ def run(f_layer, b_layer, f_input=None, b_grad=None, is_last_layer_in_bwd=False)
         if f_layer is not None:
             with f_layer.get_fp8_context():
                 f_input = f_layer.moe_combine.forward(f_input)
-                f_input = f_layer.mtp_post_process.forward(f_input)
 
         if b_layer is not None and not b_layer.config.ep_overlap_early_attn_memory_release:
             b_grad = b_layer.attn.backward(b_grad)
 
+        if f_layer is not None:
+            with f_layer.get_fp8_context():
+                f_input = f_layer.mtp_post_process.forward(f_input)
+
         # Delay the last attn_dw in backward pass (attn_dw of the first layer)
         # for overlapping with the p2p comm
         if b_layer is not None and not is_last_layer_in_bwd:
diff --git a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py
index 1b03b935639..329b8f259ea 100644
--- a/megatron/core/models/gpt/experimental_attention_variant_module_specs.py
+++ b/megatron/core/models/gpt/experimental_attention_variant_module_specs.py
@@ -6,12 +6,25 @@
 from megatron.core.models.backends import BackendSpecProvider
 from megatron.core.ssm.gated_delta_net import GatedDeltaNet, GatedDeltaNetSubmodules
 from megatron.core.transformer.enums import AttnMaskType, LayerType
+from megatron.core.transformer.experimental_attention_variant.csa import (
+    CompressedSparseAttention,
+    CompressedSparseAttentionSubmodules,
+    Compressor,
+    CompressorSubmodules,
+    CSAIndexer,
+    CSAIndexerSubmodules,
+)
+from megatron.core.transformer.experimental_attention_variant.deepseek_v4_hybrid_attention import (
+    DSv4HybridSelfAttention,
+    DSv4HybridSelfAttentionSubmodules,
+)
 from megatron.core.transformer.experimental_attention_variant.dsa import (
     DSAIndexer,
     DSAIndexerSubmodules,
     DSAttention,
     DSAttentionSubmodules,
 )
+from megatron.core.transformer.hyper_connection import HyperConnectionModule
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.multi_latent_attention import (
     MLASelfAttention,
@@ -24,6 +37,7 @@
 )
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import (
+    HyperConnectionTransformerLayer,
     TransformerLayer,
     TransformerLayerSubmodules,
     get_transformer_layer_offset,
@@ -81,17 +95,6 @@ def get_dsa_module_spec_for_backend(
     assert config.multi_latent_attention, "Currently only MLA supports sparse attention."
     assert config.qk_l2_norm is False, "qk_l2_norm is not supported with MLA."
 
-    linear_q_up_proj = (
-        backend.column_parallel_layer_norm_linear()
-        if config.qk_layernorm
-        else backend.column_parallel_linear()
-    )
-    linear_kv_up_proj = (
-        backend.column_parallel_layer_norm_linear()
-        if config.qk_layernorm
-        else backend.column_parallel_linear()
-    )
-
     # Because TransformerEngine does not support sparse attention yet, we use local
     # implementation whether the backend is TransformerEngine or not.
     core_attention = ModuleSpec(
@@ -109,19 +112,27 @@ def get_dsa_module_spec_for_backend(
         ),
     )
 
+    # Adjust for RMS norm.
+    rms_norm = config.normalization == "RMSNorm"
+    # DSA indexer requires normalized q as input, so here we cannot fuse qk layernorm
+    # with linear projection and have to use unfused qk layernorm.
+    qk_norm = (
+        backend.layer_norm(rms_norm=rms_norm, for_qk=True) if config.qk_layernorm else IdentityOp
+    )
+
     attention = ModuleSpec(
         module=MLASelfAttention,
         params={"attn_mask_type": AttnMaskType.causal},
         submodules=MLASelfAttentionSubmodules(
             linear_q_proj=backend.column_parallel_linear(),
             linear_q_down_proj=backend.linear(),
-            linear_q_up_proj=linear_q_up_proj,
+            linear_q_up_proj=backend.column_parallel_linear(),
             linear_kv_down_proj=backend.linear(),
-            linear_kv_up_proj=linear_kv_up_proj,
+            linear_kv_up_proj=backend.column_parallel_linear(),
             core_attention=core_attention,
             linear_proj=backend.row_parallel_linear(),
-            q_layernorm=IdentityOp,
-            kv_layernorm=IdentityOp,
+            q_layernorm=qk_norm,
+            kv_layernorm=qk_norm,
         ),
         metainfo={"fuse_input_layernorm": False},
     )
@@ -129,6 +140,63 @@ def get_dsa_module_spec_for_backend(
     return attention
 
 
+def get_dsv4_hybrid_module_spec_for_backend(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> ModuleSpec:
+    """Helper function to get module spec for DSv4 Hybrid Sparse Attention."""
+    assert config.multi_latent_attention, "Currently only MLA supports sparse attention."
+    assert config.qk_l2_norm is False, "qk_l2_norm is not supported with MLA."
+
+    # Adjust for RMS norm.
+    rms_norm = config.normalization == "RMSNorm"
+    # DSA indexer requires normalized q as input, so here we cannot fuse qk layernorm
+    # with linear projection and have to use unfused qk layernorm.
+    qk_norm = (
+        backend.layer_norm(rms_norm=rms_norm, for_qk=True) if config.qk_layernorm else IdentityOp
+    )
+
+    compressor_spec = ModuleSpec(
+        module=Compressor,
+        submodules=CompressorSubmodules(
+            linear_wkv=backend.linear(),
+            linear_wgate=backend.linear(),
+            norm=backend.layer_norm(rms_norm=True, for_qk=False),
+        ),
+    )
+
+    indexer_spec = ModuleSpec(
+        module=CSAIndexer,
+        submodules=CSAIndexerSubmodules(
+            linear_wq_b=backend.linear(),
+            linear_weights_proj=backend.linear(),
+            compressor=compressor_spec,
+        ),
+    )
+
+    core_attention = ModuleSpec(
+        module=CompressedSparseAttention,
+        submodules=CompressedSparseAttentionSubmodules(
+            compressor=compressor_spec, indexer=indexer_spec
+        ),
+    )
+
+    attention = ModuleSpec(
+        module=DSv4HybridSelfAttention,
+        params={"attn_mask_type": AttnMaskType.causal},
+        submodules=DSv4HybridSelfAttentionSubmodules(
+            linear_q_down_proj=backend.linear(),
+            linear_q_up_proj=backend.column_parallel_linear(),
+            linear_kv_proj=backend.column_parallel_linear(),
+            core_attention=core_attention,
+            linear_proj=backend.row_parallel_linear(),
+            q_layernorm=qk_norm,
+            kv_layernorm=qk_norm,
+        ),
+        metainfo={"fuse_input_layernorm": False},
+    )
+    return attention
+
+
 def get_experimental_attention_variant_module_spec(
     config: TransformerConfig, backend: BackendSpecProvider = None
 ) -> ModuleSpec:
@@ -141,6 +209,8 @@ def get_experimental_attention_variant_module_spec(
         return get_gated_delta_net_module_spec(config=config, backend=backend)
     elif config.experimental_attention_variant == "dsa":
         return get_dsa_module_spec_for_backend(config=config, backend=backend)
+    elif config.experimental_attention_variant == "dsv4_hybrid":
+        return get_dsv4_hybrid_module_spec_for_backend(config=config, backend=backend)
     else:
         raise ValueError(
             f"Invalid experimental attention variant: {config.experimental_attention_variant}"
@@ -152,12 +222,12 @@ def get_experimental_attention_variant_module_spec(
 ##########
 
 
-def get_transformer_block_with_experimental_attention_variant_spec(
-    config: TransformerConfig, vp_stage: Optional[int] = None, pp_rank: Optional[int] = None
-) -> TransformerBlockSubmodules:
-    """Build transformer block spec with experimental attention variants (e.g., linear attention).
+def get_transformer_layer_with_experimental_attention_variant_spec(
+    config: TransformerConfig, backend: BackendSpecProvider = None
+) -> List[ModuleSpec]:
+    """Build transformer layer specs with experimental attention variants (e.g., linear attention).
 
-    This function constructs a heterogeneous transformer block that supports mixing different
+    This function is for constructing a heterogeneous transformer that supports mixing different
     attention mechanisms (experimental vs standard) and MLP types (MoE vs dense) across layers.
     **Note that, this API is a experimental API in the short term, and might be deprecated in the
     future. In the long run, we will move to a new design that better support hybrid models.**
@@ -173,22 +243,19 @@ def get_transformer_block_with_experimental_attention_variant_spec(
         2. Per-Layer Spec Construction: Iterates through layers, constructing transformer
            layer specs based on attention and MLP patterns.
 
-        3. Pipeline Slicing: Extracts layer specs for the current pipeline stage.
-
     Args:
         config: Transformer configuration containing model hyperparameters and feature flags.
-        vp_stage: Virtual pipeline stage index for interleaved pipeline parallelism.
-        pp_rank: Pipeline model parallel rank.
 
     Returns:
-        TransformerBlockSubmodules containing per-layer specs and final layer norm.
+        List[ModuleSpec] containing per-layer specs.
 
     Note:
         Currently only supports transformer_engine backend. Kitchen backend can be used as a
         wrapper with TE fallback for unsupported operations.
     """
 
-    backend = _get_backend_spec_provider(config=config)
+    if backend is None:
+        backend = _get_backend_spec_provider(config=config)
 
     # Get attention patterns and specs
     experimental_attention_pattern = [0] * config.num_layers
@@ -227,6 +294,10 @@ def get_transformer_block_with_experimental_attention_variant_spec(
 
     # Get GPT decoder block layer specs
     rms_norm = config.normalization == "RMSNorm"
+    enable_hc = config.enable_hyper_connections
+    hc_module = HyperConnectionModule if enable_hc else IdentityOp
+    layer_module = HyperConnectionTransformerLayer if enable_hc else TransformerLayer
+
     layer_specs = []
     for layer_number in range(config.num_layers):
         attention = (
@@ -248,18 +319,56 @@ def get_transformer_block_with_experimental_attention_variant_spec(
 
         layer_specs.append(
             ModuleSpec(
-                module=TransformerLayer,
+                module=layer_module,
                 submodules=TransformerLayerSubmodules(
                     input_layernorm=input_layernorm,
                     self_attention=attention,
                     self_attn_bda=get_bias_dropout_add,
+                    self_attention_hyper_connection=hc_module,
                     pre_mlp_layernorm=pre_mlp_layernorm,
                     mlp=mlp,
                     mlp_bda=get_bias_dropout_add,
+                    mlp_hyper_connection=hc_module,
                 ),
             )
         )
 
+    return layer_specs
+
+
+def get_transformer_block_with_experimental_attention_variant_spec(
+    config: TransformerConfig, vp_stage: Optional[int] = None, pp_rank: Optional[int] = None
+) -> TransformerBlockSubmodules:
+    """Build transformer block spec with experimental attention variants (e.g., linear attention).
+
+    This function constructs a heterogeneous transformer block that supports mixing different
+    attention mechanisms (experimental vs standard) and MLP types (MoE vs dense) across layers.
+    **Note that, this API is a experimental API in the short term, and might be deprecated in the
+    future. In the long run, we will move to a new design that better support hybrid models.**
+
+    Constructing transformer layer specs by
+    `get_transformer_layer_with_experimental_attention_variant_spec` and then slicing the
+    layer specs to only include the layers that are built in this pipeline stage.
+
+    Args:
+        config: Transformer configuration containing model hyperparameters and feature flags.
+        vp_stage: Virtual pipeline stage index for interleaved pipeline parallelism.
+        pp_rank: Pipeline model parallel rank.
+
+    Returns:
+        TransformerBlockSubmodules containing per-layer specs and final layer norm.
+
+    Note:
+        Currently only supports transformer_engine backend. Kitchen backend can be used as a
+        wrapper with TE fallback for unsupported operations.
+    """
+
+    backend = _get_backend_spec_provider(config=config)
+
+    layer_specs = get_transformer_layer_with_experimental_attention_variant_spec(
+        config=config, backend=backend
+    )
+
     # Slice the layer specs to only include the layers that are built in this pipeline stage.
     if config.pipeline_model_parallel_layout is not None:
         local_layer_ids = config.pipeline_model_parallel_layout.get_layer_id_list(
@@ -273,6 +382,7 @@ def get_transformer_block_with_experimental_attention_variant_spec(
     layer_specs = [layer_specs[layer_id] for layer_id in local_layer_ids]
 
     # Get GPT decoder block spec
+    rms_norm = config.normalization == "RMSNorm"
     gpt_decoder_block_spec = TransformerBlockSubmodules(
         layer_specs=layer_specs, layer_norm=backend.layer_norm(rms_norm=rms_norm, for_qk=False)
     )
@@ -362,7 +472,7 @@ def _get_backend_spec_provider(config: TransformerConfig) -> BackendSpecProvider
     )
     backend: BackendSpecProvider = (
         KitchenSpecProvider(
-            fallback=TESpecProvider(),
+            fallback=TESpecProvider(fallback_to_eager_attn=config.fallback_to_eager_attn),
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
         )
@@ -398,6 +508,7 @@ def _get_self_attention_module_spec(
         qk_l2_norm=config.qk_l2_norm,
         use_kitchen=config.use_kitchen,
         use_te_activation_func=config.use_te_activation_func,
+        fallback_to_eager_attn=config.fallback_to_eager_attn,
         use_kitchen_attention=config.use_kitchen_attention,
         kitchen_attention_backend=config.kitchen_attention_backend,
         mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
diff --git a/megatron/core/models/gpt/fine_grained_callables.py b/megatron/core/models/gpt/fine_grained_callables.py
index fa2a2ec4934..e78e4121a8b 100644
--- a/megatron/core/models/gpt/fine_grained_callables.py
+++ b/megatron/core/models/gpt/fine_grained_callables.py
@@ -316,12 +316,6 @@ def backward_impl(self, outputs, output_grad):
         detached_grad = tuple([e.grad for e in self.detached])
         grads = output_grad + detached_grad
         self.default_backward_func(outputs + self.before_detached, grads)
-        # release the output grad memory after backward finishes,
-        # except when delay_wgrad_comptue is enabled, the grad should be
-        # kept until all modules' backward_dw has been invoked.
-        if self.delay_wgrad_compute:
-            self.output_grads = grads
-            self.delay_grads_release = len(self.bwd_dw_callables) > 0
 
         # return grads for record stream
         return grads
@@ -339,13 +333,6 @@ def backward_dw(self):
                 module.backward_dw()
             nvtx_range_pop(nvtx_msg)
 
-        # the output grad memory is last used in wgrad compute, should be safe to release.
-        assert self.delay_grads_release, "output grad memory should be valid before wgrad."
-        if self.manual_release_grads:
-            for tensor in self.output_grads:
-                tensor.untyped_storage().resize_(0)
-        self.output_grads = None
-
         self.bwd_dw_callables = None
 
     def __del__(self):
@@ -479,18 +466,16 @@ def forward_func(
                 )
                 if not isinstance(layer.mlp, MoELayer):
                     return hidden_states, None, None, None
+                mlp_norm_manager = off_interface(layer.offload_mlp_norm, hidden_states, "mlp_norm")
+                node.layer_state.mlp_norm_manager = mlp_norm_manager
                 if layer.recompute_pre_mlp_layernorm:
                     layer.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput()
-                    with off_interface(
-                        layer.offload_mlp_norm, hidden_states, "mlp_norm"
-                    ) as hidden_states:
+                    with mlp_norm_manager as hidden_states:
                         pre_mlp_layernorm_output = layer.pre_mlp_norm_checkpoint.checkpoint(
                             apply_module(layer.pre_mlp_layernorm), hidden_states
                         )
                 else:
-                    with off_interface(
-                        layer.offload_mlp_norm, hidden_states, "mlp_norm"
-                    ) as hidden_states:
+                    with mlp_norm_manager as hidden_states:
                         pre_mlp_layernorm_output = apply_module(layer.pre_mlp_layernorm)(
                             hidden_states
                         )
@@ -604,10 +589,12 @@ def submodule_combine_forward(node: ScheduleNode, output: torch.Tensor):
             )
         # Delay the offload of the mlp norm until after the mlp_bda has been computed
         # because the residual is needed in the mlp_bda.
-        if layer.offload_mlp_norm:
-            hidden_states = off_interface.group_commit(
-                hidden_states, name="mlp_norm", forced_released_tensors=[residual]
+        mlp_norm_manager = getattr(node.layer_state, 'mlp_norm_manager', None)
+        if mlp_norm_manager is not None:
+            hidden_states = mlp_norm_manager.group_offload(
+                hidden_states, forced_released_tensors=[residual]
             )
+            node.layer_state.mlp_norm_manager = None
         output = make_viewless_tensor(
             inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True
         )
diff --git a/megatron/core/models/gpt/gpt_layer_specs.py b/megatron/core/models/gpt/gpt_layer_specs.py
index 5e90f0b36be..c55e2c029b7 100755
--- a/megatron/core/models/gpt/gpt_layer_specs.py
+++ b/megatron/core/models/gpt/gpt_layer_specs.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+import copy
 import warnings
 from typing import Optional, Union
 
@@ -12,6 +13,7 @@
 from megatron.core.models.gpt.moe_module_specs import get_moe_module_spec_for_backend
 from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
 from megatron.core.transformer.enums import AttnMaskType, LayerType
+from megatron.core.transformer.hyper_connection import HyperConnectionModule
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.mlp import MLP, MLPSubmodules
 from megatron.core.transformer.multi_latent_attention import (
@@ -34,6 +36,7 @@
 )
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import (
+    HyperConnectionTransformerLayer,
     TransformerLayer,
     TransformerLayerSubmodules,
     get_transformer_layer_offset,
@@ -42,10 +45,10 @@
 from megatron.core.utils import is_te_min_version
 
 if HAVE_TE:
-    from megatron.core.extensions.transformer_engine import TEFusedMLP, TENorm
+    from megatron.core.extensions.transformer_engine import TEFusedDenseMLP, TEFusedMLP, TENorm
     from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider
 else:
-    TEFusedMLP, TENorm, TESpecProvider = None, None, None
+    TEFusedDenseMLP, TEFusedMLP, TENorm, TESpecProvider = None, None, None, None
 
 try:
     from megatron.core.extensions.kitchen import HAVE_KITCHEN, KitchenSpecProvider
@@ -54,7 +57,7 @@
     HAVE_KITCHEN = False
 
 try:
-    import apex  # type: ignore[import-untyped]  # pylint: disable=unused-import
+    import apex  # pylint: disable=unused-import
 
     from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
 
@@ -180,9 +183,12 @@ def get_gpt_layer_with_transformer_engine_submodules(
     use_te_op_fuser: Optional[bool] = False,
     use_kitchen: bool = False,
     use_te_activation_func: bool = False,
+    fallback_to_eager_attn: bool = False,
     use_kitchen_attention: bool = False,
     kitchen_attention_backend: str = "sdpa",
+    enable_hyper_connection: bool = False,
     mla_down_proj_fusion: bool = False,
+    dense_grouped_gemm: bool = False,
 ) -> TransformerLayerSubmodules:
     """Use these submodules to use lower-level Transformer Engine modules (required for fp8
     training).
@@ -197,6 +203,8 @@ def get_gpt_layer_with_transformer_engine_submodules(
         qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False.
         use_te_op_fuser (bool, optional): Use Transformer Engine's operation-based API, which may
                                           enable certain operation fusions. Defaults to False.
+        enable_hyper_connection (bool): Use HyperConnectionTransformerLayer with
+            HyperConnectionModule instead of plain TransformerLayer. Defaults to False.
         mla_down_proj_fusion (bool, optional): Enable fused q/kv down-projection and fused input
                                                layernorm when backend supports. Otherwise fall back
                                                to the unfused MLA.
@@ -214,7 +222,7 @@ def get_gpt_layer_with_transformer_engine_submodules(
     if use_kitchen:
         assert HAVE_KITCHEN
         backend: BackendSpecProvider = KitchenSpecProvider(
-            fallback=TESpecProvider(),
+            fallback=TESpecProvider(fallback_to_eager_attn=fallback_to_eager_attn),
             use_kitchen_attention=use_kitchen_attention,
             kitchen_attention_backend=kitchen_attention_backend,
         )
@@ -223,7 +231,7 @@ def get_gpt_layer_with_transformer_engine_submodules(
         if use_te_activation_func:
             raise AssertionError("use_te_activation_func not compatible with using kitchen.")
     else:
-        backend = TESpecProvider()
+        backend = TESpecProvider(fallback_to_eager_attn=fallback_to_eager_attn)
 
     mlp = get_mlp_module_spec_for_backend(
         backend=backend,
@@ -231,8 +239,11 @@ def get_gpt_layer_with_transformer_engine_submodules(
         moe_grouped_gemm=moe_grouped_gemm,
         use_te_op_fuser=use_te_op_fuser,
         use_te_activation_func=use_te_activation_func,
+        dense_grouped_gemm=dense_grouped_gemm,
     )
 
+    hc_module = HyperConnectionModule if enable_hyper_connection else IdentityOp
+
     if multi_latent_attention:
         assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA."
         linear_q_up_proj = (
@@ -302,9 +313,11 @@ def get_gpt_layer_with_transformer_engine_submodules(
                 ),
             ),
             self_attn_bda=get_bias_dropout_add,
+            self_attention_hyper_connection=hc_module,
             pre_mlp_layernorm=backend.layer_norm(has_residual=True) if num_experts else IdentityOp,
             mlp=mlp,
             mlp_bda=get_bias_dropout_add,
+            mlp_hyper_connection=hc_module,
         )
     else:
         qk_norm = backend.layer_norm(for_qk=True)
@@ -325,9 +338,11 @@ def get_gpt_layer_with_transformer_engine_submodules(
                 ),
             ),
             self_attn_bda=get_bias_dropout_add,
+            self_attention_hyper_connection=hc_module,
             pre_mlp_layernorm=backend.layer_norm(has_residual=True) if num_experts else IdentityOp,
             mlp=mlp,
             mlp_bda=get_bias_dropout_add,
+            mlp_hyper_connection=hc_module,
             sharded_state_dict_keys_map={
                 "mlp.0.weight": "mlp.linear_fc1.layer_norm_weight",
                 "mlp.0.bias": "mlp.linear_fc1.layer_norm_bias",
@@ -342,8 +357,10 @@ def get_gpt_layer_with_transformer_engine_submodules(
 @copy_signature(get_gpt_layer_with_transformer_engine_submodules)
 def get_gpt_layer_with_transformer_engine_spec(*args, **kwargs) -> ModuleSpec:
     """Use this spec to use lower-level Transformer Engine modules (required for fp8 training)."""
+    enable_hc = kwargs.get('enable_hyper_connection', False)
+    layer_module = HyperConnectionTransformerLayer if enable_hc else TransformerLayer
     return ModuleSpec(
-        module=TransformerLayer,
+        module=layer_module,
         submodules=get_gpt_layer_with_transformer_engine_submodules(*args, **kwargs),
     )
 
@@ -359,6 +376,7 @@ def get_gpt_layer_local_submodules(
     use_kitchen: bool = False,
     use_kitchen_attention: bool = False,
     kitchen_attention_backend: str = "sdpa",
+    enable_hyper_connection: bool = False,
 ) -> TransformerLayerSubmodules:
     """Use these submodules for an implementation using only modules in Megatron-Core.
 
@@ -370,6 +388,8 @@ def get_gpt_layer_local_submodules(
         multi_latent_attention (bool, optional): To use MLA. Defaults to False.
         fp8 (str, optional): Deprecated. For temporary Nemo compatibility.
         qk_l2_norm (bool, optional): To use l2 norm for queries/keys. Defaults to False.
+        enable_hyper_connection (bool): Use HyperConnectionTransformerLayer with
+            HyperConnectionModule instead of plain TransformerLayer. Defaults to False.
 
     Returns:
         TransformerLayerSubmodules: Megatron-Core modules to construct a TransformerLayer
@@ -402,6 +422,8 @@ def get_gpt_layer_local_submodules(
         backend=backend, num_experts=num_experts, moe_grouped_gemm=moe_grouped_gemm
     )
 
+    hc_module = HyperConnectionModule if enable_hyper_connection else IdentityOp
+
     if multi_latent_attention:
         assert qk_l2_norm is False, "qk_l2_norm is not supported with MLA."
         return TransformerLayerSubmodules(
@@ -422,9 +444,11 @@ def get_gpt_layer_local_submodules(
                 ),
             ),
             self_attn_bda=get_bias_dropout_add,
+            self_attention_hyper_connection=hc_module,
             pre_mlp_layernorm=layer_norm,
             mlp=mlp,
             mlp_bda=get_bias_dropout_add,
+            mlp_hyper_connection=hc_module,
         )
     else:
         return TransformerLayerSubmodules(
@@ -445,9 +469,11 @@ def get_gpt_layer_local_submodules(
                 ),
             ),
             self_attn_bda=get_bias_dropout_add,
+            self_attention_hyper_connection=hc_module,
             pre_mlp_layernorm=layer_norm,
             mlp=mlp,
             mlp_bda=get_bias_dropout_add,
+            mlp_hyper_connection=hc_module,
             sharded_state_dict_keys_map={
                 "input_layernorm.": "self_attention.linear_qkv.layer_norm_",
                 "pre_mlp_layernorm.": "mlp.linear_fc1.layer_norm_",
@@ -458,8 +484,10 @@ def get_gpt_layer_local_submodules(
 @copy_signature(get_gpt_layer_local_submodules)
 def get_gpt_layer_local_spec(*args, **kwargs) -> ModuleSpec:
     """Use this spec for an implementation using only modules in Megatron-Core."""
+    enable_hc = kwargs.get('enable_hyper_connection', False)
+    layer_module = HyperConnectionTransformerLayer if enable_hc else TransformerLayer
     return ModuleSpec(
-        module=TransformerLayer, submodules=get_gpt_layer_local_submodules(*args, **kwargs)
+        module=layer_module, submodules=get_gpt_layer_local_submodules(*args, **kwargs)
     )
 
 
@@ -516,6 +544,7 @@ def get_mlp_module_spec_for_backend(
     moe_grouped_gemm: Optional[bool] = False,
     use_te_op_fuser: Optional[bool] = False,
     use_te_activation_func: bool = False,
+    dense_grouped_gemm: bool = False,
 ) -> ModuleSpec:
     """Helper function to get module spec for MLP/MoE"""
 
@@ -524,7 +553,12 @@ def get_mlp_module_spec_for_backend(
 
     if num_experts is None:
         # Dense MLP w/ or w/o TE modules.
-        module = TEFusedMLP if use_te_op_fuser else MLP
+        if dense_grouped_gemm and use_te_op_fuser:
+            module = TEFusedDenseMLP
+        elif use_te_op_fuser:
+            module = TEFusedMLP
+        else:
+            module = MLP
         if backend.fuse_layernorm_and_linear():
             linear_fc1 = backend.column_parallel_layer_norm_linear()
             assert linear_fc1 is not None
@@ -551,12 +585,14 @@ def get_gpt_decoder_layer_specs(
     use_transformer_engine: bool,
     normalization: Optional[str] = None,
     qk_l2_norm: Optional[bool] = False,
-    vp_stage: Optional[int] = None,
-    pp_rank: Optional[int] = None,
 ) -> TransformerBlockSubmodules:
     """GPT block spec."""
+    assert config.experimental_attention_variant is None, (
+        "Experimental attention variant is not supported with get_gpt_decoder_layer_specs, "
+        f"but got {config.experimental_attention_variant=}."
+    )
+
     if use_transformer_engine:
-        layer_norm_impl = TENorm
         dense_layer_spec = get_gpt_layer_with_transformer_engine_spec(
             num_experts=None,
             moe_grouped_gemm=False,
@@ -565,6 +601,7 @@ def get_gpt_decoder_layer_specs(
             qk_l2_norm=qk_l2_norm,
             use_kitchen=config.use_kitchen,
             use_te_activation_func=config.use_te_activation_func,
+            enable_hyper_connection=config.enable_hyper_connections,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
             mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
@@ -577,6 +614,7 @@ def get_gpt_decoder_layer_specs(
             qk_l2_norm=qk_l2_norm,
             use_kitchen=config.use_kitchen,
             use_te_activation_func=config.use_te_activation_func,
+            enable_hyper_connection=config.enable_hyper_connections,
             use_kitchen_attention=config.use_kitchen_attention,
             kitchen_attention_backend=config.kitchen_attention_backend,
             mla_down_proj_fusion=getattr(config, "mla_down_proj_fusion", False),
@@ -597,7 +635,6 @@ def get_gpt_decoder_layer_specs(
             moe_use_legacy_grouped_gemm=config.moe_use_legacy_grouped_gemm,
         )
     else:
-        layer_norm_impl = LNImpl
         dense_layer_spec = get_gpt_layer_local_spec(
             num_experts=None,
             moe_grouped_gemm=False,
@@ -606,8 +643,7 @@ def get_gpt_decoder_layer_specs(
             normalization=normalization,
             qk_l2_norm=qk_l2_norm,
             use_kitchen=config.use_kitchen,
-            use_kitchen_attention=config.use_kitchen_attention,
-            kitchen_attention_backend=config.kitchen_attention_backend,
+            enable_hyper_connection=config.enable_hyper_connections,
         )
         moe_layer_spec = get_gpt_layer_local_spec(
             num_experts=config.num_moe_experts,
@@ -617,8 +653,7 @@ def get_gpt_decoder_layer_specs(
             normalization=normalization,
             qk_l2_norm=qk_l2_norm,
             use_kitchen=config.use_kitchen,
-            use_kitchen_attention=config.use_kitchen_attention,
-            kitchen_attention_backend=config.kitchen_attention_backend,
+            enable_hyper_connection=config.enable_hyper_connections,
         )
 
     # Parse config.moe_layer_freq to determine the pattern of expert/dense layers.
@@ -666,13 +701,16 @@ def get_gpt_decoder_block_spec(
     layer_specs = get_gpt_decoder_layer_specs(
         config, use_transformer_engine, normalization, qk_l2_norm
     )
+
     # Slice the layer specs to only include the layers that are built in this pipeline stage.
     # Note: MCore layer_number starts at 1
     num_layers_to_build = get_num_layers_to_build(config, vp_stage=vp_stage, pp_rank=pp_rank)
 
     if config.pipeline_model_parallel_layout is not None:
         layout = config.pipeline_model_parallel_layout
-        assert isinstance(layout, PipelineParallelLayerLayout)
+        assert isinstance(
+            layout, PipelineParallelLayerLayout
+        ), f"Invalid pipeline model parallel layout: {layout}"
         local_layer_specs = [
             layer_specs[layer_id]
             for layer_id in layout.get_layer_id_list(
@@ -683,13 +721,13 @@ def get_gpt_decoder_block_spec(
         offset = get_transformer_layer_offset(config, vp_stage=vp_stage, pp_rank=pp_rank)
         local_layer_specs = layer_specs[offset : offset + num_layers_to_build]
 
+    # Block spec.
     if use_transformer_engine:
         layer_norm_impl = TENorm
     elif config.transformer_impl == "inference_optimized":
         layer_norm_impl = TENorm
     else:
         layer_norm_impl = LNImpl
-    # Block spec.
     block_spec = TransformerBlockSubmodules(
         layer_specs=local_layer_specs, layer_norm=layer_norm_impl
     )
@@ -706,22 +744,17 @@ def get_gpt_mtp_block_spec(
 ) -> MultiTokenPredictionBlockSubmodules:
     """GPT Multi-Token Prediction (MTP) block spec."""
     if use_transformer_engine:
-        backend: BackendSpecProvider = (
-            KitchenSpecProvider(
-                fallback=TESpecProvider(),
+        if config.use_kitchen:
+            backend: BackendSpecProvider = KitchenSpecProvider(
+                fallback=TESpecProvider(fallback_to_eager_attn=config.fallback_to_eager_attn),
                 use_kitchen_attention=config.use_kitchen_attention,
                 kitchen_attention_backend=config.kitchen_attention_backend,
             )
-            if config.use_kitchen
-            else TESpecProvider()
-        )
+        else:
+            backend = TESpecProvider(fallback_to_eager_attn=config.fallback_to_eager_attn)
     else:
         backend = (
-            KitchenSpecProvider(
-                fallback=LocalSpecProvider(),
-                use_kitchen_attention=config.use_kitchen_attention,
-                kitchen_attention_backend=config.kitchen_attention_backend,
-            )
+            KitchenSpecProvider(fallback=LocalSpecProvider())
             if config.use_kitchen
             else LocalSpecProvider()
         )
@@ -744,14 +777,18 @@ def get_gpt_mtp_block_spec_for_backend(
 
     if isinstance(spec, TransformerBlockSubmodules):
         # get the spec for the last layer of decoder block
-        transformer_layer_spec = spec.layer_specs[-1]
-    elif isinstance(spec, ModuleSpec) and spec.module == TransformerLayer:
-        transformer_layer_spec = spec
+        transformer_layer_spec = copy.copy(spec.layer_specs[-1])
+    elif isinstance(spec, ModuleSpec) and issubclass(spec.module, TransformerLayer):
+        transformer_layer_spec = copy.copy(spec)
     else:
         raise ValueError(f"Invalid spec: {spec}")
 
+    transformer_layer_spec.submodules = copy.copy(transformer_layer_spec.submodules)
+
     mtp_layer_spec = get_mtp_layer_spec_for_backend(
-        mtp_model_layer_spec=transformer_layer_spec, backend=backend
+        mtp_model_layer_spec=transformer_layer_spec,
+        backend=backend,
+        enable_hyper_connections=config.enable_hyper_connections,
     )
     mtp_num_layers = config.mtp_num_layers if config.mtp_num_layers else 0
     mtp_layer_specs = [mtp_layer_spec] * mtp_num_layers
diff --git a/megatron/core/models/gpt/gpt_model.py b/megatron/core/models/gpt/gpt_model.py
index 4fe641bb17b..6639eeceb80 100644
--- a/megatron/core/models/gpt/gpt_model.py
+++ b/megatron/core/models/gpt/gpt_model.py
@@ -25,6 +25,8 @@
 from megatron.core.quantization.utils import get_quant_config_or_none
 from megatron.core.tensor_parallel import gather_from_sequence_parallel_region
 from megatron.core.transformer.enums import CudaGraphScope, ModelType
+from megatron.core.transformer.linear_cross_entropy import LinearCrossEntropyModule
+from megatron.core.transformer.moe.paged_stash import paged_stash_init_chunk_handler
 from megatron.core.transformer.multi_token_prediction import (
     MultiTokenPredictionBlock,
     mtp_on_this_rank,
@@ -146,6 +148,11 @@ def __init__(
             self.config, ignore_virtual=False, vp_stage=vp_stage
         )
 
+        self.fuse_linear_cross_entropy = (
+            self.config.cross_entropy_loss_fusion
+            and self.config.cross_entropy_fusion_impl == "linear"
+        )
+
         if self.pre_process or self.mtp_process:
             self.embedding = LanguageModelEmbedding(
                 config=self.config,
@@ -169,7 +176,7 @@ def __init__(
                 cp_group=self.pg_collection.cp,
             )
 
-        elif self.position_embedding_type == 'yarn':
+        elif self.position_embedding_type == 'yarn' and not self.config.multi_latent_attention:
             self.rotary_pos_emb = YarnRotaryEmbedding(
                 kv_channels=self.config.kv_channels,
                 rotary_percent=rotary_percent,
@@ -241,7 +248,7 @@ def __init__(
                 self.embedding_activation_buffer = None
                 self.grad_output_buffer = None
 
-            self.output_layer = tensor_parallel.ColumnParallelLinear(
+            self.output_layer = LinearCrossEntropyModule(
                 config.hidden_size,
                 self.vocab_size,
                 config=config,
@@ -374,7 +381,7 @@ def _preprocess(
                     and packed_seq_params.qkv_format == 'thd',
                     cp_group=packed_seq_params.cp_group if packed_seq_params is not None else None,
                 )
-        elif self.position_embedding_type == 'yarn':
+        elif self.position_embedding_type == 'yarn' and not self.config.multi_latent_attention:
             if self.training or not self.config.flash_decode:
                 rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
                     inference_context, self.decoder, decoder_input, self.config, packed_seq_params
@@ -459,21 +466,30 @@ def _preprocess(
     def preprocess_for_fine_grained_offloading(self):
         """Preprocess for fine-grained activation offloading."""
         off_interface.init_chunk_handler(
+            pp_rank=self.pg_collection.pp.rank(),
             vp_size=self.config.virtual_pipeline_model_parallel_size,
             vp_stage=self.vp_stage,
             min_offloaded_tensor_size=self.config.min_offloaded_tensor_size,
+            delta_offload_bytes_across_pp_ranks=self.config.delta_offload_bytes_across_pp_ranks,
+            activation_offload_fraction=self.config.activation_offload_fraction,
         )
         if self.disable_param_offloading:
             for param in self.decoder.parameters():
-                off_interface.mark_not_offloadable(param)
+                off_interface.mark_not_offload(param)
             if self.mtp_process:
                 for param in self.mtp.parameters():
-                    off_interface.mark_not_offloadable(param)
+                    off_interface.mark_not_offload(param)
             if self.post_process:
                 for param in self.output_layer.parameters():
-                    off_interface.mark_not_offloadable(param)
+                    off_interface.mark_not_offload(param)
             self.disable_param_offloading = False
 
+    def preprocess_for_paged_stash(self):
+        """Preprocess for paged stash."""
+        return paged_stash_init_chunk_handler(
+            vp_size=self.config.virtual_pipeline_model_parallel_size, vp_stage=self.vp_stage
+        )
+
     def forward(
         self,
         input_ids: Tensor,
@@ -510,6 +526,9 @@ def forward(
         if self.config.fine_grained_activation_offloading:
             self.preprocess_for_fine_grained_offloading()
 
+        if self.config.moe_paged_stash:
+            self.preprocess_for_paged_stash()
+
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
         preproc_output = self._preprocess(
@@ -532,8 +551,13 @@ def forward(
 
         rotary_pos_cos_sin = preproc_output[6] if len(preproc_output) == 7 else None
 
+        # Pass input_ids to decoder for hash-based MoE routing
+        decoder_extra_block_kwargs = extra_block_kwargs or {}
+        if self.config.moe_n_hash_layers > 0 and input_ids is not None:
+            decoder_extra_block_kwargs['input_ids'] = input_ids
+
         # Run decoder.
-        hidden_states = self.decoder(
+        decoder_output = self.decoder(
             hidden_states=decoder_input,
             attention_mask=attention_mask,
             inference_context=inference_context,
@@ -544,8 +568,15 @@ def forward(
             packed_seq_params=packed_seq_params,
             sequence_len_offset=sequence_len_offset,
             padding_mask=padding_mask,
-            **(extra_block_kwargs or {}),
+            **decoder_extra_block_kwargs,
         )
+        # When mHC + MTP, the decoder returns (contracted, multi-stream).
+        # MTP needs multi-stream; lm_head needs contracted.
+        if isinstance(decoder_output, tuple):
+            hidden_states, mhc_multistream = decoder_output
+        else:
+            hidden_states = decoder_output
+            mhc_multistream = None
 
         return self._postprocess(
             hidden_states=hidden_states,
@@ -566,6 +597,7 @@ def forward(
             extra_block_kwargs=extra_block_kwargs,
             inference_context=inference_context,
             is_spec_decode=is_spec_decode,
+            mhc_multistream=mhc_multistream,
         )
 
     def _postprocess(
@@ -588,6 +620,7 @@ def _postprocess(
         extra_block_kwargs=None,
         inference_context=None,
         is_spec_decode=None,
+        mhc_multistream=None,
     ):
         """Postprocesses decoder hidden states to generate logits or compute loss.
 
@@ -617,6 +650,7 @@ def _postprocess(
                 input_ids=input_ids,
                 position_ids=position_ids,
                 hidden_states=hidden_states,
+                mhc_multistream=mhc_multistream,
                 attention_mask=attention_mask,
                 inference_params=None,  # MTP layers don't use KV cache
                 rotary_pos_emb=rotary_pos_emb,
@@ -674,9 +708,12 @@ def _postprocess(
                 reshaped = hidden_states.squeeze(1).unsqueeze(0)
                 hidden_states = inference_context.last_token_logits(reshaped).unsqueeze(1)
 
-        logits, _ = self.output_layer(
-            hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
-        )
+        if has_config_logger_enabled(self.config) or labels is None:
+            logits, _ = self.output_layer(
+                hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
+            )
+        else:
+            logits = None
 
         # Apply MuP output scaling to logits
         logits = self._scale_logits(logits)
@@ -706,7 +743,18 @@ def _postprocess(
             # [s b h] => [b s h]
             return logits.transpose(0, 1).contiguous()
 
-        loss = self.compute_language_model_loss(labels, logits)
+        output_layer_kwargs = dict(
+            input_=hidden_states, weight=output_weight, runtime_gather_output=runtime_gather_output
+        )
+        if self.fuse_linear_cross_entropy:
+            loss = self.output_layer(
+                output_cross_entropy_loss=self.fuse_linear_cross_entropy,
+                labels=labels,
+                **output_layer_kwargs,
+            )
+        else:
+            logits, _ = self.output_layer(**output_layer_kwargs)
+            loss = self.compute_language_model_loss(labels, logits)
 
         return loss
 
@@ -757,6 +805,8 @@ def build_schedule_plan(
 
         if self.config.fine_grained_activation_offloading:
             self.preprocess_for_fine_grained_offloading()
+        if self.config.moe_paged_stash:
+            self.preprocess_for_paged_stash()
 
         from ..common.model_chunk_schedule_plan import TransformerModelChunkSchedulePlan
 
diff --git a/megatron/core/models/hybrid/hybrid_model.py b/megatron/core/models/hybrid/hybrid_model.py
index f1b3c102634..91728b316a8 100644
--- a/megatron/core/models/hybrid/hybrid_model.py
+++ b/megatron/core/models/hybrid/hybrid_model.py
@@ -305,19 +305,22 @@ def set_input_tensor(self, input_tensor: Tensor) -> None:
     def preprocess_for_fine_grained_offloading(self):
         """Preprocess for fine-grained activation offloading."""
         off_interface.init_chunk_handler(
+            pp_rank=self.pg_collection.pp.rank(),
             vp_size=self.config.virtual_pipeline_model_parallel_size,
             vp_stage=self.vp_stage,
             min_offloaded_tensor_size=self.config.min_offloaded_tensor_size,
+            delta_offload_bytes_across_pp_ranks=self.config.delta_offload_bytes_across_pp_ranks,
+            activation_offload_fraction=self.config.activation_offload_fraction,
         )
         if self.disable_param_offloading:
             for param in self.decoder.parameters():
-                off_interface.mark_not_offloadable(param)
+                off_interface.mark_not_offload(param)
             if self.mtp_process:
                 for param in self.mtp.parameters():
-                    off_interface.mark_not_offloadable(param)
+                    off_interface.mark_not_offload(param)
             if self.post_process:
                 for param in self.output_layer.parameters():
-                    off_interface.mark_not_offloadable(param)
+                    off_interface.mark_not_offload(param)
             self.disable_param_offloading = False
 
     def forward(
diff --git a/megatron/core/optimizer/__init__.py b/megatron/core/optimizer/__init__.py
index 4f69a9efd55..c6d3e41aed5 100644
--- a/megatron/core/optimizer/__init__.py
+++ b/megatron/core/optimizer/__init__.py
@@ -975,6 +975,8 @@ def get_megatron_optimizer(
                 # applied to the Megatron-FSDP main weight and extended to FusedAdam
                 # main weights. Override this here.
                 setattr(optimizer_part.optimizer, "master_weights", False)
+                # Megatron-FSDP always uses a decoupled gradient when using FusedAdam.
+                setattr(optimizer_part.optimizer, "use_decoupled_grad", True)
 
             optimizers.append(optimizer_part)
             model_chunk_offset += 1
diff --git a/megatron/core/optimizer/clip_grads.py b/megatron/core/optimizer/clip_grads.py
index 3d796654db4..01095a50154 100644
--- a/megatron/core/optimizer/clip_grads.py
+++ b/megatron/core/optimizer/clip_grads.py
@@ -59,18 +59,18 @@ def get_grad_norm_fp32(
 ) -> float:
     """Calculate the p-norm of gradients in FP32 precision.
 
-    This function is adapted from `torch.nn.utils.clip_grad.clip_grad_norm_` 
-    and extends it with functionality to handle model-parallel parameters. 
-    It ensures that the norm is correctly computed and reduced across 
-    the specified process group (typically the model-parallel group for 
+    This function is adapted from `torch.nn.utils.clip_grad.clip_grad_norm_`
+    and extends it with functionality to handle model-parallel parameters.
+    It ensures that the norm is correctly computed and reduced across
+    the specified process group (typically the model-parallel group for
     non-distributed optimizers or the entire world for distributed optimizers).
 
     Args:
-        grads_for_norm (Union[List[torch.Tensor], torch.Tensor]): An iterable 
+        grads_for_norm (Union[List[torch.Tensor], torch.Tensor]): An iterable
             of Tensors or a single Tensor used to calculate the gradient norm.
-        norm_type (Union[int, float]): The type of the p-norm to use. Can be 
+        norm_type (Union[int, float]): The type of the p-norm to use. Can be
             'inf' for infinity norm. Defaults to 2.
-        grad_stats_parallel_group (ProcessGroup, optional): The process group 
+        grad_stats_parallel_group (ProcessGroup, optional): The process group
             used for reducing gradient statistics (e.g., norms and zero counts).
 
     Returns:
@@ -155,13 +155,13 @@ def clip_grad_by_total_norm_fp32(
     Note that the gradients are modified in-place.
 
     Args:
-        parameters (Union[List[torch.Tensor], torch.Tensor]): An iterable of 
+        parameters (Union[List[torch.Tensor], torch.Tensor]): An iterable of
             Tensors or a single Tensor that will have gradients normalized.
-        max_norm (Union[int, float]): The maximum permissible total norm 
+        max_norm (Union[int, float]): The maximum permissible total norm
             of the gradients.
         total_norm (float): The current total norm of the gradients.
-        use_decoupled_grad (bool, optional): Whether to read from the 
-            '.decoupled_grad' attribute instead of the standard '.grad'. 
+        use_decoupled_grad (bool, optional): Whether to read from the
+            '.decoupled_grad' attribute instead of the standard '.grad'.
             Defaults to False.
     """
     # Grads.
@@ -204,19 +204,19 @@ def count_zeros_fp32(
 ) -> float:
     """Counts the number of zero values in the gradients of the given parameters.
 
-    The count is performed in FP32. This method filters parameters to ensure 
-    gradients are not double-counted by checking if the gradient is not None, 
-    the parameter is not shared, and the parameter is not a replica due 
-    to tensor model parallelism. It also handles parameters managed by 
+    The count is performed in FP32. This method filters parameters to ensure
+    gradients are not double-counted by checking if the gradient is not None,
+    the parameter is not shared, and the parameter is not a replica due
+    to tensor model parallelism. It also handles parameters managed by
     Megatron FSDP specifically.
 
     Args:
-        parameters (Union[List[torch.Tensor], torch.Tensor]): An iterable of 
+        parameters (Union[List[torch.Tensor], torch.Tensor]): An iterable of
             Tensors or a single Tensor whose gradients will be checked for zeros.
-        grad_stats_parallel_group (ProcessGroup): The process group used for 
+        grad_stats_parallel_group (ProcessGroup): The process group used for
             reducing the zero count across distributed ranks.
-        use_decoupled_grad (bool, optional): If True, reads from the 
-            '.decoupled_grad' attribute instead of the standard '.grad'. 
+        use_decoupled_grad (bool, optional): If True, reads from the
+            '.decoupled_grad' attribute instead of the standard '.grad'.
             Defaults to False.
 
     Returns:
@@ -271,4 +271,4 @@ def count_zeros_fp32(
 
     total_num_zeros = total_num_zeros.item()
 
-    return total_num_zeros
\ No newline at end of file
+    return total_num_zeros
diff --git a/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py b/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py
new file mode 100644
index 00000000000..81fd116c8ba
--- /dev/null
+++ b/megatron/core/optimizer/cpu_offloading/optimizer_state_offloader.py
@@ -0,0 +1,315 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+"""Optimizer state offloading class."""
+
+from typing import TYPE_CHECKING, Dict, List, Tuple
+
+import torch
+
+if TYPE_CHECKING:
+    from megatron.core.optimizer.distrib_optimizer import DistributedOptimizer
+
+
+class OptimizerStateOffloader:
+    """
+    Manages offloading of optimizer states and master weights to CPU.
+    Used with DistributedOptimizer to reduce GPU memory usage.
+
+    Supports overlapped D2H/H2D transfers using CUDA streams.
+
+    Master weights can be stored in two locations:
+    - In adam optimizer state (when use_precision_aware_optimizer_no_fp8_or_ds_fp8 is True)
+    - In mcore's shard_fp32_from_float16_groups
+    """
+
+    OPTIMIZER_STATE_KEYS = ('exp_avg', 'exp_avg_sq')
+    MASTER_WEIGHT_KEY = 'master_param'
+
+    def __init__(self, distrib_optimizer: "DistributedOptimizer"):
+        """
+        Args:
+            distrib_optimizer: The DistributedOptimizer to offload states and master weights from.
+        """
+        self.dist_optimizer = distrib_optimizer
+        self.adam_optimizer = distrib_optimizer.optimizer
+
+        # Only support TE FusedAdam optimizer for now.
+        try:
+            from transformer_engine.pytorch.optimizers import FusedAdam
+
+            assert isinstance(self.adam_optimizer, FusedAdam), (
+                f"OptimizerStateOffloader requires TE FusedAdam optimizer, "
+                f"but got {type(self.adam_optimizer).__name__}"
+            )
+        except ImportError:
+            raise ImportError(
+                "OptimizerStateOffloader requires transformer_engine.pytorch.optimizers.FusedAdam"
+            )
+
+        # Check if master weights are stored in adam optimizer state
+        self.optimizer_contains_master_weights = self.adam_optimizer.master_weights
+
+        # CUDA streams for async transfers
+        self._d2h_stream = torch.cuda.Stream()
+        self._h2d_stream = torch.cuda.Stream()
+
+        # CPU buffers for optimizer states: {param: {key: cpu_tensor}}
+        self._opt_state_cpu_buffers: Dict[torch.Tensor, Dict[str, torch.Tensor]] = {}
+
+        # CPU buffers for mcore master weights, matching the structure of source groups
+        # List[List[cpu_tensor]]
+        self._shard_fp32_from_float16_cpu_buffers: List[List[torch.Tensor]] = []
+
+        # State tracking
+        self._offloaded = False
+        self._offloaded_state_keys: Tuple[str, ...] = ()
+        self._offloaded_mcore_master_weights = False
+
+        # Track whether optimizer states (exp_avg, exp_avg_sq) have been initialized.
+        # These are lazily initialized by FusedAdam during the first optimizer.step().
+        # Master weights (shard_fp32_from_float16_groups) are available from the start.
+        self._optimizer_states_initialized = False
+
+    def mark_optimizer_states_initialized(self):
+        """
+        Mark that optimizer states (exp_avg, exp_avg_sq) are now available.
+        Should be called after the first optimizer.step() completes.
+        """
+        self._optimizer_states_initialized = True
+
+    def _get_state_keys_to_offload(
+        self, offload_optimizer_states: bool, offload_master_weights: bool
+    ) -> Tuple[str, ...]:
+        """Get the state keys in FusedAdam to offload based on configuration."""
+        keys = []
+        # Skip optimizer states offloading if they haven't been initialized yet.
+        # Optimizer states are lazily initialized by FusedAdam during the first optimizer.step().
+        if self._optimizer_states_initialized:
+            if offload_optimizer_states:
+                keys.extend(self.OPTIMIZER_STATE_KEYS)
+            if offload_master_weights and self.optimizer_contains_master_weights:
+                keys.append(self.MASTER_WEIGHT_KEY)
+        return tuple(keys)
+
+    def _ensure_state_cpu_buffer(
+        self, param: torch.Tensor, state_key: str, gpu_tensor: torch.Tensor, pin_memory: bool = True
+    ) -> torch.Tensor:
+        """Get or create a CPU buffer for a state tensor."""
+        if param not in self._opt_state_cpu_buffers:
+            self._opt_state_cpu_buffers[param] = {}
+
+        if state_key not in self._opt_state_cpu_buffers[param]:
+            cpu_buffer = torch.empty(
+                gpu_tensor.size(),
+                dtype=gpu_tensor.dtype,
+                layout=gpu_tensor.layout,
+                device='cpu',
+                pin_memory=pin_memory,
+            )
+            self._opt_state_cpu_buffers[param][state_key] = cpu_buffer
+
+        return self._opt_state_cpu_buffers[param][state_key]
+
+    def _offload_shard_groups(
+        self,
+        shard_groups: List[List[torch.Tensor]],
+        cpu_buffers: List[List[torch.Tensor]],
+        pin_memory: bool = True,
+    ):
+        """Offload a shard group to CPU buffers."""
+        # Initialize CPU buffers on first call
+        if len(cpu_buffers) == 0:
+            for group in shard_groups:
+                group_buffers = []
+                for gpu_tensor in group:
+                    cpu_buffer = torch.empty(
+                        gpu_tensor.size(),
+                        dtype=gpu_tensor.dtype,
+                        layout=gpu_tensor.layout,
+                        device='cpu',
+                        pin_memory=pin_memory,
+                    )
+                    group_buffers.append(cpu_buffer)
+                cpu_buffers.append(group_buffers)
+
+        # Copy D2H
+        for group_idx, group in enumerate(shard_groups):
+            for param_idx, gpu_tensor in enumerate(group):
+                cpu_buffer = cpu_buffers[group_idx][param_idx]
+                cpu_buffer.copy_(gpu_tensor, non_blocking=pin_memory)
+                gpu_tensor.record_stream(self._d2h_stream)
+
+    def _offload_states(
+        self,
+        offload_optimizer_states: bool,
+        offload_master_weights: bool,
+        use_pin_memory: bool = True,
+    ):
+        """Offload optimizer states and/or master weights to CPU."""
+        # Offload states from adam optimizer
+        self._offloaded_state_keys = self._get_state_keys_to_offload(
+            offload_optimizer_states, offload_master_weights
+        )
+        states = self.adam_optimizer.state
+
+        for param, param_state in states.items():
+            for state_key in self._offloaded_state_keys:
+                if state_key not in param_state:
+                    continue
+
+                gpu_tensor = param_state[state_key]
+                if not isinstance(gpu_tensor, torch.Tensor) or not gpu_tensor.is_cuda:
+                    continue
+
+                cpu_buffer = self._ensure_state_cpu_buffer(
+                    param, state_key, gpu_tensor, use_pin_memory
+                )
+                cpu_buffer.copy_(gpu_tensor, non_blocking=use_pin_memory)
+                gpu_tensor.record_stream(self._d2h_stream)
+
+        # Offload mcore master weights if not in optimizer state
+        if offload_master_weights and not self.optimizer_contains_master_weights:
+            self._offload_shard_groups(
+                self.dist_optimizer.shard_fp32_from_float16_groups,
+                self._shard_fp32_from_float16_cpu_buffers,
+                use_pin_memory,
+            )
+            self._offloaded_mcore_master_weights = True
+
+    def _release_states(self):
+        """Replace optimizer state GPU tensors with CPU tensors to free GPU memory."""
+        states = self.adam_optimizer.state
+
+        for param, param_state in states.items():
+            if param not in self._opt_state_cpu_buffers:
+                continue
+
+            for state_key in self._offloaded_state_keys:
+                if state_key not in self._opt_state_cpu_buffers[param]:
+                    continue
+
+                param_state[state_key].untyped_storage().resize_(0)
+
+        if self._offloaded_mcore_master_weights:
+            for group in self.dist_optimizer.shard_fp32_from_float16_groups:
+                for gpu_tensor in group:
+                    gpu_tensor.untyped_storage().resize_(0)
+
+    def _reload_shard_groups(
+        self,
+        shard_groups: List[List[torch.Tensor]],
+        cpu_buffers: List[List[torch.Tensor]],
+        is_allocate_stage: bool,
+    ):
+        """Reload shard groups from CPU to GPU."""
+        for group_idx, group in enumerate(shard_groups):
+            for param_idx, _ in enumerate(group):
+                cpu_buffer = cpu_buffers[group_idx][param_idx]
+                if is_allocate_stage:
+                    shard_groups[group_idx][param_idx].untyped_storage().resize_(
+                        cpu_buffer.untyped_storage().size()
+                    )
+                else:
+                    shard_groups[group_idx][param_idx].copy_(
+                        cpu_buffer, non_blocking=cpu_buffer.is_pinned()
+                    )
+
+    def _reload_states(self, is_allocate_stage: bool):
+        """
+        Reload optimizer states and/or master weights from CPU to GPU.
+
+        If is_allocate_stage is True, only allocate GPU memory for the states and master weights,
+        but do not copy the data from CPU to GPU. Otherwise, copy the data from CPU to GPU.
+        The two processes are separated to make sure that the GPU memory is allocated on the
+        default stream to avoid fragmentation.
+        """
+        # Reload states to adam optimizer
+        states = self.adam_optimizer.state
+
+        for param, param_state in states.items():
+            if param not in self._opt_state_cpu_buffers:
+                continue
+
+            for state_key in self._offloaded_state_keys:
+                if state_key not in self._opt_state_cpu_buffers[param]:
+                    continue
+
+                cpu_buffer = self._opt_state_cpu_buffers[param][state_key]
+                if is_allocate_stage:
+                    param_state[state_key].untyped_storage().resize_(
+                        cpu_buffer.untyped_storage().size()
+                    )
+                else:
+                    param_state[state_key].copy_(cpu_buffer, non_blocking=cpu_buffer.is_pinned())
+
+        # Reload mcore master weights if not in optimizer state
+        if self._offloaded_mcore_master_weights:
+            self._reload_shard_groups(
+                self.dist_optimizer.shard_fp32_from_float16_groups,
+                self._shard_fp32_from_float16_cpu_buffers,
+                is_allocate_stage,
+            )
+
+    def offload(self, offload_optimizer_states: bool = True, offload_master_weights: bool = True):
+        """
+        Offload optimizer states and/or master weights to CPU.
+        Starts async D2H transfer that can overlap with other operations.
+
+        Args:
+            offload_optimizer_states: Whether to offload exp_avg, exp_avg_sq.
+            offload_master_weights: Whether to offload master weights.
+        """
+        if not offload_optimizer_states and not offload_master_weights:
+            return
+
+        # Wait for current stream finishing updating the optimizer states.
+        self._d2h_stream.wait_stream(torch.cuda.current_stream())
+
+        with torch.cuda.stream(self._d2h_stream):
+            self._offload_states(offload_optimizer_states, offload_master_weights)
+
+        self._offloaded = True
+
+    def release_gpu_memory(self):
+        """
+        Release GPU memory for optimizer states and master weights after D2H copy completes.
+
+        This is separated from offload() to allow delayed GPU memory release,
+        which is needed for mxfp8 + overlap_param_gather case where master weights
+        must remain on GPU until after _copy_main_params_to_param_buffer() is called.
+        """
+        if not self._offloaded:
+            return
+
+        self._release_states()
+
+    def reload(self):
+        """
+        Reload optimizer states and/or master weights from CPU to GPU.
+        Call before optimizer.step() to ensure states are on GPU.
+        """
+        if not self._offloaded:
+            return
+
+        # Allocate GPU memory on the current stream to avoid fragmentation.
+        self._reload_states(is_allocate_stage=True)
+
+        self._h2d_stream.wait_stream(self._d2h_stream)
+        self._h2d_stream.wait_stream(torch.cuda.current_stream())
+
+        # Reload states on the h2d stream to overlap with other operations.
+        with torch.cuda.stream(self._h2d_stream):
+            self._reload_states(is_allocate_stage=False)
+
+        self._offloaded_state_keys = ()
+        self._offloaded_mcore_master_weights = False
+        self._offloaded = False
+
+    def sync_before_step(self):
+        """
+        Wait for H2D reload to complete before optimizer.step().
+        Must be called to ensure states are on GPU before optimizer uses them.
+
+        This is separated from reload() to make it possible to move the reload ahead of time.
+        """
+        torch.cuda.current_stream().wait_stream(self._h2d_stream)
diff --git a/megatron/core/optimizer/distrib_optimizer.py b/megatron/core/optimizer/distrib_optimizer.py
index b4d52e6b56b..490ca614b21 100644
--- a/megatron/core/optimizer/distrib_optimizer.py
+++ b/megatron/core/optimizer/distrib_optimizer.py
@@ -52,6 +52,7 @@
 from ..fp8_utils import dequantize_fp8_tensor, is_float8tensor, quantize_param_shard
 from ..transformer.fsdp_dtensor_checkpoint import handle_experts_in_state_dict
 from ..transformer.module import MegatronModule
+from .cpu_offloading.optimizer_state_offloader import OptimizerStateOffloader
 from .grad_scaler import MegatronGradScaler
 from .optimizer import MixedPrecisionOptimizer, _zero_grad_group_helper, param_group_identifier_keys
 from .optimizer_config import OptimizerConfig
@@ -374,9 +375,10 @@ def _build_model_and_main_param_groups(
 
                     # Generate sharded model param.
                     if (
-                        is_float8tensor(model_param) and config.fp8_recipe != "delayed"
+                        cls._is_distopt_quantized_param(model_param)
+                        and config.fp8_recipe != "delayed"
                     ) or is_nvfp4tensor(model_param):
-                        # MXFP8Tensor, BlockwiseQTensor, and NVFP4Tensor don't support view(-1)
+                        # MXFP8Tensor, BlockwiseQTensor, NVFP4Tensor don't support view(-1)
                         shard_model_param = None
                     else:
                         shard_model_param = model_param.detach().view(-1)[
@@ -395,7 +397,9 @@ def _build_model_and_main_param_groups(
                         # precision at the beginning of training (this problem will not occur if the
                         # training is long enough or if the main params are loaded from a
                         # checkpoint).
-                        if is_nvfp4tensor(model_param) or is_float8tensor(model_param):
+                        if cls._is_distopt_quantized_param(model_param) or is_nvfp4tensor(
+                            model_param
+                        ):
                             if hasattr(model_param, 'get_high_precision_init_val'):
                                 shard_main_param = (
                                     model_param.get_high_precision_init_val()
@@ -533,6 +537,8 @@ def __init__(
             "due to checkpointing requirements."
         )
 
+        self._state_offloader: Optional[OptimizerStateOffloader] = None
+
         # when freezing sub-models we have no real optimizer
         # but still need a stub DistributedOptimizer class
         if optimizer is None:
@@ -622,6 +628,9 @@ def __init__(
             self.optimizer.param_groups = [g["orig_group"] for g in self.opt_group_ranges]
             self.optimizer.load_state_dict(self.optimizer.state_dict())
 
+        if self.config.offload_optimizer_states:
+            self._state_offloader = OptimizerStateOffloader(self)
+
     def _get_model_param_range_map(self, param: torch.nn.Parameter):
         """
         Given a model param, get the index sub-range of the param that this
@@ -930,6 +939,70 @@ def _get_main_param_and_optimizer_states(self, model_param):
                     tensors[k] = v
         return tensors
 
+    @staticmethod
+    def _is_grouped_quantized_tensor(tensor: torch.Tensor) -> bool:
+        """Check if tensor is a TE GroupedTensor using quantized storage."""
+        return (
+            hasattr(tensor, "split_into_quantized_tensors")
+            and callable(tensor.split_into_quantized_tensors)
+            and getattr(tensor, "quantizer", None) is not None
+        )
+
+    @classmethod
+    def _is_distopt_quantized_param(cls, tensor: torch.Tensor) -> bool:
+        """Check if tensor should follow quantized parameter path in dist optimizer."""
+        return is_float8tensor(tensor) or cls._is_grouped_quantized_tensor(tensor)
+
+    def _expand_quantized_param_shard_for_cast(
+        self,
+        model_param: torch.Tensor,
+        shard_main_param: Optional[torch.Tensor],
+        start_offset: Optional[int],
+    ):
+        """Expand one quantized model param to cast-ready entries.
+
+        For grouped quantized tensors, split into member quantized tensors and map the sharded
+        master slice to per-member offset ranges, while preserving deterministic ordering across
+        DP ranks.
+        """
+        if not self._is_grouped_quantized_tensor(model_param):
+            return [model_param], [shard_main_param], [start_offset]
+
+        quantized_members = model_param.quantized_tensors
+        if quantized_members is None:
+            quantized_members = model_param.split_into_quantized_tensors()
+
+        shard_start = 0 if start_offset is None else start_offset
+        shard_size = 0 if shard_main_param is None else shard_main_param.numel()
+        shard_end = shard_start + shard_size
+        shard_flat = None if shard_main_param is None else shard_main_param.view(-1)
+
+        expanded_model_params = []
+        expanded_shard_main_params = []
+        expanded_start_offsets = []
+        member_offset = 0
+        for member in quantized_members:
+            member_numel = member.numel()
+            member_start = member_offset
+            member_end = member_start + member_numel
+            overlap_start = max(member_start, shard_start)
+            overlap_end = min(member_end, shard_end)
+
+            member_master = None
+            member_start_offset = None
+            if overlap_start < overlap_end:
+                local_start = overlap_start - shard_start
+                local_end = overlap_end - shard_start
+                member_master = shard_flat[local_start:local_end]
+                member_start_offset = overlap_start - member_start
+
+            expanded_model_params.append(member)
+            expanded_shard_main_params.append(member_master)
+            expanded_start_offsets.append(member_start_offset)
+            member_offset = member_end
+
+        return expanded_model_params, expanded_shard_main_params, expanded_start_offsets
+
     def _set_main_param_and_optimizer_states(self, model_param, tensors):
         """Set the main param and optimizer states corresponding to the input model_param.
 
@@ -2166,7 +2239,7 @@ def split_state_dict_if_needed(self, state_dict):
         fp8_gbuf_indices = []
         for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges):
             for dtype, _ in gbuf_range_maps.items():
-                if is_float8tensor(self.buffers[gbuf_idx].params[0]):
+                if self._is_distopt_quantized_param(self.buffers[gbuf_idx].params[0]):
                     fp8_gbuf_indices.append(gbuf_idx)
         if len(fp8_gbuf_indices) == 0:
             return
@@ -2188,7 +2261,7 @@ def split_state_dict_if_needed(self, state_dict):
         new_state_dict = {'buckets_coalesced': state_dict['buckets_coalesced']}
         for gbuf_idx, gbuf_range_maps in enumerate(self.gbuf_ranges):
             for dtype, _ in gbuf_range_maps.items():
-                if not is_float8tensor(self.buffers[gbuf_idx].params[0]):
+                if not self._is_distopt_quantized_param(self.buffers[gbuf_idx].params[0]):
                     new_state_dict[gbuf_idx] = state_dict[dtype_to_gbuf_idx[dtype]]
 
         for fp8_gbuf_idx in fp8_gbuf_indices:
@@ -2390,7 +2463,7 @@ def _get_fp8_params_and_shard_fp32_from_fp8(self):
         idx = 0
         for buffer in buffers:
             for param in buffer.params:
-                if is_float8tensor(param):
+                if self._is_distopt_quantized_param(param):
                     fp8_params.append(param)
                     shard_fp32_from_fp8.append(None)
                     shard_offsets_in_fp8.append(None)
@@ -2405,7 +2478,7 @@ def get_shard_fp32_from_fp8(shard_main_groups, model_groups):
             """
             for shard_main_group, model_group in zip(shard_main_groups, model_groups):
                 for shard_main_param, model_param in zip(shard_main_group, model_group):
-                    if is_float8tensor(model_param):
+                    if self._is_distopt_quantized_param(model_param):
                         param_range_map = self._get_model_param_range_map(model_param)
                         param_range = param_range_map["param"]
                         assert param_range.size == shard_main_param.nelement()
@@ -2527,17 +2600,36 @@ def _copy_main_params_to_model_params(self):
         if self.config.use_precision_aware_optimizer_no_fp8_or_ds_fp8:
             return
 
-        if self.ddp_config.fp8_param_gather:
-            quantize_param_shard(
-                *self._get_fp8_params_and_shard_fp32_from_fp8(), self.data_parallel_group
-            )
-        elif self.ddp_config.fp4_param_gather:
+        if self.ddp_config.fp4_param_gather:
             # Quantize FP32 master shards back to NVFP4 model params (rowwise only)
             quantize_nvfp4_param_shard(
                 *self._get_nvfp4_params_and_shard_fp32_from_nvfp4(), self.data_parallel_group
             )
-        else:
-            pass
+
+        fp8_params, shard_fp32_from_fp8, shard_offsets_in_fp8 = (
+            self._get_fp8_params_and_shard_fp32_from_fp8()
+        )
+        expanded_fp8_params = []
+        expanded_shard_fp32_from_fp8 = []
+        expanded_shard_offsets_in_fp8 = []
+        for model_param, shard_main_param, start_offset in zip(
+            fp8_params, shard_fp32_from_fp8, shard_offsets_in_fp8
+        ):
+            sub_model_params, sub_shard_main_params, sub_start_offsets = (
+                self._expand_quantized_param_shard_for_cast(
+                    model_param, shard_main_param, start_offset
+                )
+            )
+            expanded_fp8_params.extend(sub_model_params)
+            expanded_shard_fp32_from_fp8.extend(sub_shard_main_params)
+            expanded_shard_offsets_in_fp8.extend(sub_start_offsets)
+
+        quantize_param_shard(
+            expanded_fp8_params,
+            expanded_shard_fp32_from_fp8,
+            expanded_shard_offsets_in_fp8,
+            self.data_parallel_group,
+        )
 
         # Utility method for copying group params.
         def copy_group_params(shard_main_groups, model_groups):
@@ -2556,12 +2648,8 @@ def copy_group_params(shard_main_groups, model_groups):
                         world_range.start : world_range.end
                     ]
 
-                    if is_float8tensor(model_param):
-                        # FP8 params are quantized in the above "quantize_param_shard" function.
-                        continue
-                    elif is_nvfp4tensor(model_param):
-                        # NVFP4 params are quantized in the above "quantize_nvfp4_param_shard"
-                        # function.
+                    if self._is_distopt_quantized_param(model_param) or is_nvfp4tensor(model_param):
+                        # Quantized params are handled above.
                         continue
                     else:
                         shard_model_param.data.copy_(shard_main_param)
@@ -2678,8 +2766,12 @@ def copy_group_params(model_groups, shard_main_groups):
                         # Use param from state_dict to initialize main_param
                         model_param = model_param_to_state_dict_param_map[model_param]
 
-                    if is_float8tensor(model_param):
-                        shard_model_param = dequantize_fp8_tensor(model_param).view(-1)[
+                    if self._is_distopt_quantized_param(model_param):
+                        if self._is_grouped_quantized_tensor(model_param):
+                            dequantized_model_param = model_param.float()
+                        else:
+                            dequantized_model_param = dequantize_fp8_tensor(model_param)
+                        shard_model_param = dequantized_model_param.view(-1)[
                             param_range.start : param_range.end
                         ]
                     else:
@@ -2698,6 +2790,8 @@ def step_with_ready_grads(self) -> bool:
         Under the hood, either launch synchronous param all-gathers or get ready to launch
         asynchorous all-gathers that get overlapped with the next forward pass.
         """
+        if self._state_offloader is not None:
+            self._state_offloader.sync_before_step()
         update_successful = super().step_with_ready_grads()
 
         timers = self.config.timers
@@ -2720,4 +2814,22 @@ def step_with_ready_grads(self) -> bool:
         if timers is not None:
             timers('params-all-gather').stop()
 
+        if self._state_offloader is not None:
+            self._state_offloader.mark_optimizer_states_initialized()
+
         return update_successful
+
+    def offload_states(self):
+        """Offload states to CPU."""
+        if self._state_offloader is not None:
+            self._state_offloader.offload()
+
+    def reload_offloaded_states(self):
+        """Start async reload of offloaded states."""
+        if self._state_offloader is not None:
+            self._state_offloader.reload()
+
+    def release_offloaded_gpu_states(self):
+        """Release GPU memory after D2H completes. For delayed release case."""
+        if self._state_offloader is not None:
+            self._state_offloader.release_gpu_memory()
diff --git a/megatron/core/optimizer/grad_scaler.py b/megatron/core/optimizer/grad_scaler.py
index 03656488982..a130e54ba28 100644
--- a/megatron/core/optimizer/grad_scaler.py
+++ b/megatron/core/optimizer/grad_scaler.py
@@ -14,6 +14,7 @@ class MegatronGradScaler(ABC):
     Args:
         initial_scale (float): The initial value for the loss scale.
     """
+
     def __init__(self, initial_scale: float):
         """Initialize scale value with the input initial scale."""
         assert initial_scale > 0.0
@@ -21,23 +22,25 @@ def __init__(self, initial_scale: float):
 
     @property
     def scale(self):
+        """Return the current loss scale."""
         return self._scale
 
     @property
     def inv_scale(self):
+        """Return the reciprocal of the current loss scale."""
         return self._scale.double().reciprocal().float()
 
     @abstractmethod
     def update(self, found_inf: bool):
-        pass
+        """Update the loss scale based on whether inf/NaN was found."""
 
     @abstractmethod
     def state_dict(self):
-        pass
+        """Return the state dictionary for checkpointing."""
 
     @abstractmethod
     def load_state_dict(self, state_dict: Dict):
-        pass
+        """Load state from a checkpoint dictionary."""
 
 
 class ConstantGradScaler(MegatronGradScaler):
@@ -61,23 +64,23 @@ def load_state_dict(self, state_dict):
 class DynamicGradScaler(MegatronGradScaler):
     """Gradient scaler with a dynamic scale factor adjusted during training.
 
-    This class implements a loss scaling strategy to prevent numerical underflow 
-    during mixed-precision training. It reduces the loss scale by a 
-    `backoff_factor` if a `hysteresis` number of NaNs/Infs are detected in 
-    consecutive iterations. Conversely, it increases the loss scale by a 
-    `growth_factor` if no non-finite gradients are seen for a specified 
+    This class implements a loss scaling strategy to prevent numerical underflow
+    during mixed-precision training. It reduces the loss scale by a
+    `backoff_factor` if a `hysteresis` number of NaNs/Infs are detected in
+    consecutive iterations. Conversely, it increases the loss scale by a
+    `growth_factor` if no non-finite gradients are seen for a specified
     `growth_interval` of iterations.
 
     Args:
         initial_scale (float): The starting value for the loss scale.
         min_scale (float): The lower bound for the loss scale.
-        growth_factor (float): The multiplier used to increase the scale when 
+        growth_factor (float): The multiplier used to increase the scale when
             gradients are stable. Must be greater than 1.0.
-        backoff_factor (float): The multiplier used to decrease the scale when 
+        backoff_factor (float): The multiplier used to decrease the scale when
             non-finite gradients are detected. Must be between 0.0 and 1.0.
-        growth_interval (int): The number of consecutive stable iterations 
+        growth_interval (int): The number of consecutive stable iterations
             required before increasing the scale.
-        hysteresis (int): The number of consecutive non-finite iterations 
+        hysteresis (int): The number of consecutive non-finite iterations
             required before decreasing the scale.
     """
 
@@ -96,12 +99,14 @@ def __init__(
         Args:
             initial_scale (float): Initial loss scale value.
             min_scale (float): Minimum loss scale value.
-            growth_factor (float): Factor to grow loss scale by if NaNs are not seen in `growth_interval`
-                training iterations. Must be greater than 1.
-            backoff_factor (float): Factor to decrease loss scale by if NaNs are seen in `hysteresis`
-                consecutive training iterations. Must be between 0 and 1.
-            growth_interval (int): Number of training iterations of no NaNs before loss scale is increased.
-            hysteresis (int): Number of training iterations of consecutive NaNs before loss scale is decreased.
+            growth_factor (float): Factor to grow loss scale by if NaNs
+                are not seen in ``growth_interval`` iterations. Must be > 1.
+            backoff_factor (float): Factor to decrease loss scale by if NaNs
+                are seen in ``hysteresis`` consecutive iterations. 0 < x < 1.
+            growth_interval (int): Iterations without NaNs before the loss
+                scale is increased.
+            hysteresis (int): Consecutive NaN iterations before the loss
+                scale is decreased.
         """
         super(DynamicGradScaler, self).__init__(initial_scale)
 
diff --git a/megatron/core/optimizer/optimizer_config.py b/megatron/core/optimizer/optimizer_config.py
index 514d109ddc8..5fbf6710f76 100644
--- a/megatron/core/optimizer/optimizer_config.py
+++ b/megatron/core/optimizer/optimizer_config.py
@@ -142,7 +142,6 @@ class OptimizerConfig:
     ##############
     # General
     ##############
-
     lr: Optional[float] = None
     """Initial learning rate. Depending on decay style and initial warmup, the learning rate at each
        iteration would be different.
@@ -365,6 +364,12 @@ class OptimizerConfig:
     pin_cpu_params: bool = True
     """If True, pin the optimizer parameters to CPU memory."""
 
+    offload_optimizer_states: bool = False
+    """
+    If True, offload optimizer states to CPU after each optimizer step and
+    reload them before the next optimizer step.
+    """
+
     ################
     # Miscellaneous
     ################
diff --git a/megatron/core/parallel_state.py b/megatron/core/parallel_state.py
index 337485b4d12..863b5d55d9d 100644
--- a/megatron/core/parallel_state.py
+++ b/megatron/core/parallel_state.py
@@ -115,8 +115,8 @@
 _CONTEXT_PARALLEL_GLOBAL_RANKS = None
 # Hierarchical context parallel groups
 _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS = None
-# Hybrid context parallel groups
-_HYBRID_DP_CP_GROUPS = {}
+# Dynamic context parallel groups
+_DYNAMIC_DP_CP_GROUPS = {}
 
 # Data parallel group information with context parallel combined.
 _DATA_PARALLEL_GROUP_WITH_CP = None
@@ -418,29 +418,27 @@ def create_hierarchical_groups(
     return hierarchical_groups, hierarchical_groups_gloo
 
 
-def create_hybrid_dp_cp_groups(rank, ranks, pg_options):
+def create_dynamic_dp_cp_groups(rank, ranks, pg_options, min_cp_size=1):
     """
-    Creates groups required for hybrid DPxCP.
-    Creates a new group for every power of 2 up to the number of DPxCP ranks.
+    Creates groups required for dynamic DPxCP.
+    Creates a new group for every power of 2 from min_cp_size up to len(ranks).
     Returns a dictionary indexed by group size.
     """
-    hybrid_dp_cp_groups = {}
-    # Generate group for every power of 2 up to the number of CP ranks
-    # We limit the allowed group sizes in order to avoid excessive overhead.
-    group_sizes = [2**i for i in range(int(log2(len(ranks))))][1:]
+    dynamic_dp_cp_groups = {}
+    group_sizes = [2**i for i in range(int(log2(len(ranks)))) if 2**i >= min_cp_size]
     for group_size in group_sizes:
         for i in range(0, len(ranks), group_size):
             group = create_group(
                 ranks[i : i + group_size],
                 pg_options=pg_options,
-                group_desc=f"HYBRID_DP_CP_GROUP_{group_size}",
+                group_desc=f"DYNAMIC_DP_CP_GROUP_{group_size}",
             )
             if rank in ranks[i : i + group_size]:
                 assert (
-                    group_size not in hybrid_dp_cp_groups
-                ), f"Rank {rank} appears in multiple Hybrid DP CP groups of size {group_size}"
-                hybrid_dp_cp_groups[group_size] = group
-    return hybrid_dp_cp_groups
+                    group_size not in dynamic_dp_cp_groups
+                ), f"Rank {rank} appears in multiple Dynamic DP CP groups of size {group_size}"
+                dynamic_dp_cp_groups[group_size] = group
+    return dynamic_dp_cp_groups
 
 
 class RankGenerator(object):
@@ -552,7 +550,8 @@ def initialize_model_parallel(
     use_sharp: bool = False,
     context_parallel_size: int = 1,
     hierarchical_context_parallel_sizes: Optional[List[int]] = None,
-    hybrid_context_parallel: bool = False,
+    dynamic_context_parallel: bool = False,
+    min_dynamic_context_parallel_size: int = 1,
     expert_model_parallel_size: int = 1,
     num_distributed_optimizer_instances: int = 1,
     expert_tensor_parallel_size: Optional[int] = None,
@@ -919,18 +918,34 @@ def initialize_model_parallel(
         if "NCCL_COLLNET_ENABLE" in os.environ:
             del os.environ["NCCL_COLLNET_ENABLE"]
 
-    if hybrid_context_parallel:
-        global _HYBRID_DP_CP_GROUPS
+    if dynamic_context_parallel:
+        # TODO: Are gloo groups needed for Dynamic CP?
+        global _DYNAMIC_DP_CP_GROUPS
         for ranks_with_cp in decoder_rank_generator.get_ranks('dp-cp'):
             assert (
                 len(ranks_with_cp) % 2 == 0
-            ), "Hybrid context parallel requires an even number of ranks"
-            _HYBRID_DP_CP_GROUPS.update(
-                create_hybrid_dp_cp_groups(
-                    rank, ranks_with_cp, get_nccl_options("dp_cp", nccl_comm_cfgs)
+            ), "Dynamic context parallel requires an even number of ranks"
+            _DYNAMIC_DP_CP_GROUPS.update(
+                create_dynamic_dp_cp_groups(
+                    rank,
+                    ranks_with_cp,
+                    get_nccl_options("dp_cp", nccl_comm_cfgs),
+                    min_cp_size=min_dynamic_context_parallel_size,
                 )
             )
-        # TODO: Are gloo groups needed for hybrid cp?
+
+        data_parallel_size_with_cp = data_parallel_size * context_parallel_size
+        group_sizes = [
+            2**i
+            for i in range(int(log2(data_parallel_size_with_cp)))
+            if 2**i >= min_dynamic_context_parallel_size
+        ]
+        if data_parallel_size_with_cp not in group_sizes:
+            group_sizes.append(data_parallel_size_with_cp)
+        for group_size in group_sizes:
+            group = get_dynamic_data_context_parallel_groups(group_size=group_size)
+            torch.distributed.barrier(group=group, device_ids=[torch.cuda.current_device()])
+            torch.cuda.synchronize()
 
     for ranks in decoder_rank_generator.get_ranks('dp'):
         group = create_group(
@@ -1523,16 +1538,15 @@ def get_hierarchical_context_parallel_groups(check_initialized=True):
     return _HIERARCHICAL_CONTEXT_PARALLEL_GROUPS
 
 
-def get_hybrid_data_context_parallel_groups(check_initialized=True, group_size=None):
-    """Get the hybrid context parallel groups the caller rank belongs to."""
-    # If the group size is the same as the entire DPxCP group, return the original group
+def get_dynamic_data_context_parallel_groups(check_initialized=True, group_size=None):
+    """Get the dynamic context parallel groups the caller rank belongs to."""
     if get_data_parallel_world_size(with_context_parallel=True) == group_size:
         if check_initialized:
             assert _DATA_PARALLEL_GROUP_WITH_CP is not None
         return _DATA_PARALLEL_GROUP_WITH_CP
     if check_initialized:
-        assert _HYBRID_DP_CP_GROUPS is not None
-    return _HYBRID_DP_CP_GROUPS[group_size]
+        assert _DYNAMIC_DP_CP_GROUPS is not None
+    return _DYNAMIC_DP_CP_GROUPS[group_size]
 
 
 def get_embedding_group(check_initialized=True):
@@ -2113,6 +2127,9 @@ def destroy_model_parallel():
     global _CONTEXT_PARALLEL_GLOBAL_RANKS
     _CONTEXT_PARALLEL_GLOBAL_RANKS = None
 
+    global _DYNAMIC_DP_CP_GROUPS
+    _DYNAMIC_DP_CP_GROUPS = {}
+
     global _EMBEDDING_GROUP
     _EMBEDDING_GROUP = None
 
diff --git a/megatron/core/pipeline_parallel/combined_1f1b.py b/megatron/core/pipeline_parallel/combined_1f1b.py
index fdd3b32201f..ffb9aa0e3e2 100644
--- a/megatron/core/pipeline_parallel/combined_1f1b.py
+++ b/megatron/core/pipeline_parallel/combined_1f1b.py
@@ -52,7 +52,7 @@ def combined_1f1b_schedule_for_no_pipelining(
     Phases 4: 4th microbatch backward
     """
 
-    set_streams()
+    set_streams(high_priority=config.high_priority_a2a_comm_stream)
     # The forward step for the first microbatch is executed alone, no a2a overlapping
     output_tensor, num_tokens, _ = combined_forward_backward_step(
         forward_step_func,
@@ -178,7 +178,7 @@ def combined_1f1b_schedule_for_interleaved_pipelining():
                 # backward_step_helper_postprocess()
     """
 
-    set_streams()
+    set_streams(high_priority=config.high_priority_a2a_comm_stream)
     # forward prepare
     f_model_chunk_id = None
     f_microbatch_id = None
diff --git a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py
index 2314fc7c941..87adfc6c593 100644
--- a/megatron/core/pipeline_parallel/fine_grained_activation_offload.py
+++ b/megatron/core/pipeline_parallel/fine_grained_activation_offload.py
@@ -5,6 +5,7 @@
 from typing import Any, Dict, Tuple
 
 import torch
+from torch.autograd.graph import saved_tensors_hooks
 
 # CPU offload implementation for pipeline parallelism
 DEBUG = False
@@ -95,9 +96,9 @@ def print_offload_summary_table(total_offload_bytes: Dict[str, int]):
     torch.distributed.barrier()
 
 
-class GPUTensorPool:
+class OffloadTensorPool:
     """
-    GPU memory pool for efficient allocation and deallocation of tensors.
+    Memory pool for efficient allocation and deallocation of tensors.
 
     Features:
     - Supports multiple tensor shapes and dtypes, each with its own pool
@@ -106,7 +107,7 @@ class GPUTensorPool:
     - Uses queue-based management for O(1) allocation and deallocation
 
     Example:
-        pool = GPUTensorPool(device='cuda:0')
+        pool = OffloadTensorPool(device='cuda:0')
         tensor = pool.allocate((128, 512), dtype=torch.float32)
         # ... use tensor ...
         pool.free(tensor, (128, 512), dtype=torch.float32)
@@ -114,10 +115,10 @@ class GPUTensorPool:
 
     def __init__(self, device: str = 'cuda', pin_memory: bool = False):
         """
-        Initialize GPU tensor pool.
+        Initialize offload tensor pool.
 
         Args:
-            device: GPU device, default 'cuda'
+            device: Device, default 'cuda'
             pin_memory: Whether to use pinned memory (mainly for CPU tensors)
         """
         self.device = torch.device(device)
@@ -137,7 +138,7 @@ def __init__(self, device: str = 'cuda', pin_memory: bool = False):
             'pool_misses': 0,  # Number of times a new tensor was created
         }
 
-        debug_rank("GPUTensorPool: Initialized with dynamic allocation")
+        debug_rank("OffloadTensorPool: Initialized with dynamic allocation")
 
     def _get_pool_key(self, shape: Tuple, dtype: torch.dtype) -> Tuple:
         """Generate a unique key for the pool based on shape and dtype."""
@@ -182,7 +183,7 @@ def allocate(self, shape: Tuple, dtype: torch.dtype = torch.float32) -> torch.Te
             tensor = pool['free'].popleft()
             self._stats['pool_hits'] += 1
             debug_rank(
-                f"GPUTensorPool.allocate: Reused tensor from pool, "
+                f"OffloadTensorPool.allocate: Reused tensor from pool, "
                 f"shape={shape}, dtype={dtype}, "
                 f"remaining in pool={len(pool['free'])}"
             )
@@ -195,7 +196,7 @@ def allocate(self, shape: Tuple, dtype: torch.dtype = torch.float32) -> torch.Te
 
             memory_mb = self._calculate_memory_size(shape, dtype) / (1024**2)
             debug_rank(
-                f"GPUTensorPool.allocate: Created new tensor, "
+                f"OffloadTensorPool.allocate: Created new tensor, "
                 f"shape={shape}, dtype={dtype}, "
                 f"memory={memory_mb:.2f} MB, "
                 f"total_created={len(pool['all'])}"
@@ -245,7 +246,7 @@ def free(self, tensor: torch.Tensor):
         self._stats['current_in_use'] -= 1
 
         debug_rank(
-            f"GPUTensorPool.free: shape={shape}, dtype={dtype}, "
+            f"OffloadTensorPool.free: shape={shape}, dtype={dtype}, "
             f"available in pool={len(pool['free'])}"
         )
 
@@ -294,7 +295,7 @@ def get_pool_status(self, shape: Tuple = None, dtype: torch.dtype = None) -> Dic
 
     def reset(self):
         """Reset the pool, marking all tensors as available."""
-        debug_rank("GPUTensorPool: Resetting pool...")
+        debug_rank("OffloadTensorPool: Resetting pool...")
 
         for pool_key, pool in self._pools.items():
             # Clear and refill the free queue
@@ -304,11 +305,11 @@ def reset(self):
             pool['allocated_count'] = 0
 
         self._stats['current_in_use'] = 0
-        debug_rank("GPUTensorPool: Reset complete")
+        debug_rank("OffloadTensorPool: Reset complete")
 
     def clear(self):
         """Clear the pool and release all GPU memory."""
-        debug_rank("GPUTensorPool: Clearing pool...")
+        debug_rank("OffloadTensorPool: Clearing pool...")
 
         for pool_key, pool in self._pools.items():
             # Clear all references, allowing PyTorch GC to reclaim memory
@@ -322,7 +323,7 @@ def clear(self):
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
 
-        debug_rank("GPUTensorPool: Clear complete")
+        debug_rank("OffloadTensorPool: Clear complete")
 
     def __del__(self):
         """Destructor to ensure resources are released."""
@@ -411,11 +412,16 @@ def __init__(self):
         # allocate streams and events for synchronization
         self._d2h_stream = torch.cuda.Stream()
         self._h2d_stream = torch.cuda.Stream()
+        # CUDA graph stream and event for offloading modules in cuda graph
+        self._cuda_graph_stream = torch.cuda.Stream()
+        self._cuda_graph_event = torch.cuda.Event(external=True)
         # Shared CPU tensor pool for all chunks to improve reuse efficiency
-        self._cpu_tensor_pool = GPUTensorPool(device="cpu", pin_memory=True)
+        self._cpu_tensor_pool = OffloadTensorPool(device="cpu", pin_memory=True)
 
         # Whether the manager is in warmup phase.
         self._is_warmup = True
+        # Whether the manager is in CUDA graph replay phase.
+        self._in_replay = False
         # Cache OffloadChunkHandler objects for each virtual pipeline stage and each forward pass.
         self._cached_chunks_forward = []
         # Cache OffloadChunkHandler objects for each virtual pipeline stage and each backward pass.
@@ -434,6 +440,10 @@ def __init__(self):
         self._delayed_offload_groups = []
         self.reset()
 
+        self._saved_tensors_hooks = saved_tensors_hooks(
+            self.on_save_for_backward, self.on_get_saved_tensor
+        )
+
     @property
     def d2h_stream(self):
         """Get the device-to-host (GPU to CPU) transfer stream."""
@@ -444,22 +454,32 @@ def h2d_stream(self):
         """Get the host-to-device (CPU to GPU) transfer stream."""
         return self._h2d_stream
 
+    @property
+    def cuda_graph_stream(self):
+        """Get the CUDA graph stream."""
+        return self._cuda_graph_stream
+
+    @property
+    def cuda_graph_event(self):
+        """Get the CUDA graph event."""
+        return self._cuda_graph_event
+
     @property
     def cpu_tensor_pool(self):
         """Get the shared CPU tensor pool."""
         return self._cpu_tensor_pool
 
-    def push_offload_groups(self, group_hook, forced_released_tensors):
+    def push_offload_groups(self, group_hook, name, forced_released_tensors):
         """Push the offload groups to the delayed queue."""
         debug_rank(f"pushing offload groups to the delayed queue")
-        self._delayed_offload_groups.append((group_hook, forced_released_tensors))
+        self._delayed_offload_groups.append((group_hook, name, forced_released_tensors))
 
     def flush_delayed_groups(self):
         """Flush the delayed groups."""
         debug_rank("flushing delayed groups")
-        # Flush the delayed groups in reverse order to maintain the order of the groups.
-        for group_hook, forced_released_tensors in reversed(self._delayed_offload_groups):
-            group_hook(forced_released_tensors)
+        # Flush the delayed groups in forward order.
+        for group_hook, name, forced_released_tensors in self._delayed_offload_groups:
+            group_hook(name, forced_released_tensors)
         self._delayed_offload_groups = []
 
     def reset(self):
@@ -550,13 +570,41 @@ def post_warmup_callback(self):
                 debug_rank(f"setting offload to false for group {name} at chunk index {chunk_idx}")
             else:
                 break
-        debug_rank(f"offload margin {self._offload_margin}")
         assert self._offload_margin == 0, "Offload margin is not 0"
+        # Disable the groups to meet the delta offload bytes across PP ranks.
+        keep_on_gpu_bytes = self._pp_rank * self._delta_offload_bytes_across_pp_ranks
+        for chunk in self._cached_chunks_backward:
+            for group in chunk.offload_groups:
+                if group.offload and keep_on_gpu_bytes > 0:
+                    debug_rank(
+                        f"group {group._name} offload {group.offload} \
+                        keep_on_gpu_bytes {keep_on_gpu_bytes}"
+                    )
+                    keep_on_gpu_bytes -= group.total_offload_bytes
+                    group.offload = False
+        # Disable the groups to meet the activation offload fraction.
+        for chunk in self._cached_chunks_backward:
+            offloaded_groups_count = 0
+            for group in chunk.offload_groups:
+                if group.offload:
+                    offloaded_groups_count += 1
+            disabled_groups_count = int(
+                offloaded_groups_count * (1 - self._activation_offload_fraction)
+            )
+            debug_rank(f"Disabled {disabled_groups_count}/{offloaded_groups_count} groups")
+            for group in reversed(chunk.offload_groups):
+                if group.offload:
+                    if disabled_groups_count > 0:
+                        disabled_groups_count -= 1
+                        group.offload = False
+                    else:
+                        break
         # Dump the offload information
         total_tensor_count = {}
         total_offload_bytes = {}
         for chunk in self._cached_chunks_forward:
             for group in chunk.offload_groups:
+                debug_rank(f"chunk {chunk} group {group} offload {group.offload}")
                 if group.offload:
                     if group._name not in total_tensor_count:
                         total_tensor_count[group._name] = 0
@@ -568,6 +616,8 @@ def post_warmup_callback(self):
             # where the memory cost will not increase anymore.
             if chunk is self._cached_chunks_backward[0]:
                 break
+        debug_rank(f"total_tensor_count {total_tensor_count}")
+        debug_rank(f"total_offload_bytes {total_offload_bytes}")
         # Cache summary for downstream consumers (e.g., unit tests).
         self._offload_summary_bytes = dict(total_offload_bytes)
         self._offload_summary_total_bytes = int(sum(total_offload_bytes.values()))
@@ -608,15 +658,25 @@ def front_backward_chunk(self, name=None):
         return None
 
     def init_model_chunk_offload_handler(
-        self, vp_size, vp_stage, min_offloaded_tensor_size=1024 * 1024
+        self,
+        pp_rank,
+        vp_size,
+        vp_stage,
+        min_offloaded_tensor_size=1024 * 1024,
+        delta_offload_bytes_across_pp_ranks=0,
+        activation_offload_fraction: float = 1.0,
     ):
         """
         Initialize a chunk offload handler for a model chunk (microbatch).
 
         Args:
+            pp_rank: Pipeline parallel rank
             vp_size: Virtual pipeline size
             vp_stage: Virtual pipeline stage index (None means stage 0)
             min_offloaded_tensor_size: Minimum tensor size (in elements) to offload
+            delta_offload_bytes_across_pp_ranks:
+                Difference of offload bytes across PP ranks to balance the offload load.
+            activation_offload_fraction: Fraction of eligible groups to offload, in range [0, 1].
         """
         if not self._is_warmup:
             return
@@ -626,6 +686,10 @@ def init_model_chunk_offload_handler(
             self._vpp = vp_size
             self._stages = [[] for _ in range(vp_size)]
 
+        self._delta_offload_bytes_across_pp_ranks = delta_offload_bytes_across_pp_ranks
+        self._pp_rank = pp_rank
+        self._activation_offload_fraction = activation_offload_fraction
+
         if vp_stage is None:
             cur_vpp_rank = 0
         else:
@@ -671,10 +735,10 @@ def cur_backward_chunk(self):
         """Get the current backward pass chunk handler."""
         return self._cur_backward_chunk
 
-    def mark_not_offloadable(self, tensor: torch.Tensor):
+    def mark_not_offload(self, tensor: torch.Tensor):
         """Mark the current forward chunk as not offloadable."""
         if tensor is not None:
-            tensor.offloading_activation = False
+            tensor._do_not_offload = True
 
     def __enter__(self):
         """Enter context manager to enable activation offloading hooks."""
@@ -688,10 +752,7 @@ def __enter__(self):
         else:
             raise RuntimeError("TE CPU offload is not available")
         self.inside_context = True
-
-        torch._C._autograd._push_saved_tensors_default_hooks(
-            self.on_save_for_backward, self.on_get_saved_tensor
-        )
+        self._saved_tensors_hooks.__enter__()
 
     def __exit__(self, *args: Any):
         """Exit context manager and restore original tensor saving behavior."""
@@ -705,7 +766,7 @@ def __exit__(self, *args: Any):
         else:
             raise RuntimeError("TE CPU offload is not available")
         self.inside_context = False
-        torch._C._autograd._pop_saved_tensors_default_hooks()
+        self._saved_tensors_hooks.__exit__()
 
     def on_save_for_backward(self, tensor: torch.Tensor) -> Any:
         """
@@ -795,17 +856,17 @@ def reset(self):
         self._tensor_count_current_group = 0
         self._reloading_group = []
 
-    def find_group_with_name(self, name: str, start_index: int = 0):
+    def find_group_with_name(
+        self, groups: list[OffloadTensorGroup], name: str, start_index: int = 0
+    ):
         """Find the group with the given name starting from the given index."""
-        return next(
-            (group for group in self.offload_groups[start_index:] if group._name == name), None
-        )
+        return next((group for group in groups[start_index:] if group._name == name), None)
 
     def is_empty_chunk(self, name=None):
         """Check if this chunk has no tensors to manage."""
         debug_rank(f"------is_empty_chunk {self._max_group_size}")
         if name is not None:
-            return self.find_group_with_name(name) is None
+            return self.find_group_with_name(self.offload_groups, name) is None
         return self._max_group_size == 0
 
     def finish_all_groups(self, name=None) -> bool:
@@ -822,12 +883,15 @@ def finish_all_groups(self, name=None) -> bool:
         ):
             return True
         assert name is not None, "Name is required"
-        return self.find_group_with_name(name, self._offloaded_group_index) is None
+        return (
+            self.find_group_with_name(self.offload_groups, name, self._offloaded_group_index)
+            is None
+        )
 
     def find_next_group(self, name=None):
         """Find the next group with the given name."""
         assert name is not None, "Name is required"
-        return self.find_group_with_name(name, self._offloaded_group_index)
+        return self.find_group_with_name(self.offload_groups, name, self._offloaded_group_index)
 
     def tensor_push(self, tensor):
         """Push tensor to the offload handler."""
@@ -860,17 +924,17 @@ def tensor_pop(self, tensor_tag):
 
     def tensor_need_offloading_checker(self, tensor):
         """Check if the tensor needs to be offloaded."""
-        debug_rank(
-            f"tensor_need_offloading_checker {getattr(tensor, 'offloading_activation', None)}"
-        )
+        debug_rank("tensor_need_offloading_checker")
         if tensor.numel() < self.min_offloaded_tensor_size:
             return False
         # Respect tensor's offload preference if specified
-        if hasattr(tensor, "offloading_activation") and not tensor.offloading_activation:
+        if getattr(tensor, "_TE_do_not_offload", False) or getattr(
+            tensor, "_do_not_offload", False
+        ):
             return False
         return True
 
-    def bulk_offload_group(self):
+    def bulk_offload_group(self, group_to_offload):
         """offload a group of tensors recorded in tensor_push()."""
         debug_rank("------bulk_offload_group")
         group_to_offload = self._groups_to_offload[-1]
@@ -928,10 +992,11 @@ def pre_reload_last_layer(self):
             # Reload the last group (last layer) early
             self.bulk_reload_group()
 
-    def should_bulk_offload(self):
+    def should_bulk_offload(self, name):
         """Determine if the current group should be offloaded."""
         assert len(self._groups_to_offload) > 0, "No groups to offload"
-        group = self._groups_to_offload[-1]
+        group = self.find_group_with_name(self._groups_to_offload, name)
+        assert group is not None, f"Group {name} not found in {self._groups_to_offload}"
         debug_rank(f"should_bulk_offload {self.is_warmup} {group.offload}")
         # Don't offload if the chunk is not in warmup stage
         if self.is_warmup:
@@ -952,12 +1017,16 @@ def should_bulk_offload(self):
 
         return True
 
-    def bulk_offload(self, forced_released_tensors):
+    def bulk_offload(self, name, forced_released_tensors):
         """Offload a group of tensors and optionally release their GPU memory."""
         debug_rank("----bulk_offload")
-        if self.should_bulk_offload():
-            self._groups_to_reload.append(self._groups_to_offload[-1])
-            self.bulk_offload_group()
+        if self.should_bulk_offload(name):
+            group_to_offload = self.find_group_with_name(self._groups_to_offload, name)
+            assert (
+                group_to_offload is not None
+            ), f"Group {name} not found in {self._groups_to_offload}"
+            self._groups_to_reload.append(group_to_offload)
+            self.bulk_offload_group(group_to_offload)
             # Manually release tensors not auto-freed by torch GC
             if len(forced_released_tensors) > 0:
                 cur_stream = torch.cuda.current_stream()
@@ -967,14 +1036,14 @@ def bulk_offload(self, forced_released_tensors):
                         release_tensor.record_stream(cur_stream)
                         release_tensor.untyped_storage().resize_(0)
 
-    def on_group_commit_forward(self, forced_released_tensors):
+    def on_group_commit_forward(self, name, forced_released_tensors):
         """Called at the end of a layer group's forward pass to trigger offloading."""
         if not self.do_offload:
             return
-        debug_rank("--on_group_commit_forward")
+        debug_rank(f"--on_group_commit_forward {name}")
         # Wait for compute to finish before starting offload
         self.d2h_stream.wait_stream(torch.cuda.current_stream())
-        self.bulk_offload(forced_released_tensors)
+        self.bulk_offload(name, forced_released_tensors)
 
     def bulk_reload(self):
         """Reload the next group of tensors from CPU to GPU."""
@@ -1073,12 +1142,12 @@ def forward(ctx, tensor, cur_forward_chunk, name, forced_released_tensors, delay
         # pylint: disable=missing-function-docstring
         debug_rank("FineGrainedOffloadingGroupCommitFunction forward")
 
-        if delay_offload:
+        if delay_offload and PipelineOffloadManager.get_instance()._in_replay:
             PipelineOffloadManager.get_instance().push_offload_groups(
-                cur_forward_chunk.on_group_commit_forward, forced_released_tensors
+                cur_forward_chunk.on_group_commit_forward, name, forced_released_tensors
             )
         else:
-            cur_forward_chunk.on_group_commit_forward(forced_released_tensors)
+            cur_forward_chunk.on_group_commit_forward(name, forced_released_tensors)
         ctx.cpu_offload_handler = cur_forward_chunk
         ctx.name = name
         return tensor
@@ -1175,13 +1244,6 @@ def fine_grained_offloading_group_start(tensor, name=None):
     return FineGrainedOffloadingGroupStartFunction.apply(tensor, cur_forward_chunk, name)
 
 
-def fine_grained_offloading_forward_record(event: torch.cuda.Event) -> None:
-    """Record the forward event for cuda graph capture."""
-    d2h_stream = PipelineOffloadManager.get_instance().d2h_stream
-    torch.cuda.current_stream().record_event(event)
-    torch.cuda.current_stream().wait_stream(d2h_stream)
-
-
 class FineGrainedOffloadingBackwardRecordFunction(torch.autograd.Function):
     """
     Identity operation that marks the end of a layer group for offload synchronization.
@@ -1189,23 +1251,19 @@ class FineGrainedOffloadingBackwardRecordFunction(torch.autograd.Function):
     """
 
     @staticmethod
-    def forward(ctx, tensor, event: torch.cuda.Event) -> torch.Tensor:
+    def forward(ctx, tensor) -> torch.Tensor:
         """Forward pass for cuda graph capture."""
-        ctx.event = event
+        debug_rank("FineGrainedOffloadingBackwardRecordFunction forward")
         return tensor
 
     @staticmethod
     def backward(ctx, grad_output):
         """Record the backward event and wait for the h2d stream on cuda graph stream."""
-        h2d_stream = PipelineOffloadManager.get_instance().h2d_stream
-        torch.cuda.current_stream().record_event(ctx.event)
-        torch.cuda.current_stream().wait_stream(h2d_stream)
-        return grad_output, None
-
-
-def fine_grained_offloading_backward_record(tensor, event: torch.cuda.Event) -> torch.Tensor:
-    """Record the backward event for cuda graph capture."""
-    return FineGrainedOffloadingBackwardRecordFunction.apply(tensor, event)
+        debug_rank("FineGrainedOffloadingBackwardRecordFunction backward")
+        mgr = PipelineOffloadManager.get_instance()
+        torch.cuda.current_stream().record_event(mgr.cuda_graph_event)
+        torch.cuda.current_stream().wait_stream(mgr.h2d_stream)
+        return (grad_output,)
 
 
 class FineGrainedActivationOffloadingInterface:
@@ -1229,10 +1287,32 @@ def __exit__(self, *args: Any):
             PipelineOffloadManager.get_instance().__exit__()
 
     @staticmethod
-    def init_chunk_handler(vp_size, vp_stage, min_offloaded_tensor_size):
+    def cuda_graph_stream():
+        """Get the CUDA graph stream."""
+        return PipelineOffloadManager.get_instance().cuda_graph_stream
+
+    @staticmethod
+    def cuda_graph_event():
+        """Get the CUDA graph event."""
+        return PipelineOffloadManager.get_instance().cuda_graph_event
+
+    @staticmethod
+    def init_chunk_handler(
+        pp_rank,
+        vp_size,
+        vp_stage,
+        min_offloaded_tensor_size,
+        delta_offload_bytes_across_pp_ranks,
+        activation_offload_fraction,
+    ):
         """Initialize the chunk handler, called at the start of a microbatch forward pass."""
         PipelineOffloadManager.get_instance().init_model_chunk_offload_handler(
-            vp_size, vp_stage, min_offloaded_tensor_size
+            pp_rank,
+            vp_size,
+            vp_stage,
+            min_offloaded_tensor_size,
+            delta_offload_bytes_across_pp_ranks,
+            activation_offload_fraction,
         )
 
     @staticmethod
@@ -1240,24 +1320,37 @@ def get_context(flag):
         """Get the fine-grained offload context"""
         return PipelineOffloadManager.get_instance() if flag else nullcontext()
 
+    def group_offload(self, tensor, forced_released_tensors=None, delay_offload=False):
+        """Group offload the tensors."""
+        if self.offload:
+            return fine_grained_offloading_group_commit(
+                tensor, self.name, forced_released_tensors, delay_offload
+            )
+        return tensor
+
     @staticmethod
     def group_commit(tensor, name, forced_released_tensors=None, delay_offload=False):
-        """Group commit the tensors."""
+        """Static variant of group_offload used by main's multi_latent_attention."""
         return fine_grained_offloading_group_commit(
             tensor, name, forced_released_tensors, delay_offload
         )
 
     @staticmethod
-    def mark_not_offloadable(tensor: torch.Tensor):
+    def mark_not_offload(tensor: torch.Tensor):
         """Mark the tensor as not offloadable."""
-        PipelineOffloadManager.get_instance().mark_not_offloadable(tensor)
+        PipelineOffloadManager.get_instance().mark_not_offload(tensor)
 
     @staticmethod
-    def forward_record(event: torch.cuda.Event) -> None:
+    def forward_record() -> None:
         """Record the forward event for cuda graph capture."""
-        d2h_stream = PipelineOffloadManager.get_instance().d2h_stream
-        torch.cuda.current_stream().record_event(event)
-        torch.cuda.current_stream().wait_stream(d2h_stream)
+        mgr = PipelineOffloadManager.get_instance()
+        torch.cuda.current_stream().record_event(mgr.cuda_graph_event)
+        torch.cuda.current_stream().wait_stream(mgr.d2h_stream)
+
+    @staticmethod
+    def backward_record(tensor) -> torch.Tensor:
+        """Record the backward event for cuda graph capture."""
+        return FineGrainedOffloadingBackwardRecordFunction.apply(tensor)
 
     @staticmethod
     def reset():
@@ -1268,3 +1361,28 @@ def reset():
     def reset_instance():
         """Reset the singleton instance."""
         PipelineOffloadManager.reset_instance()
+
+    @staticmethod
+    def flush_delayed_groups():
+        """Flush the delayed groups."""
+        PipelineOffloadManager.get_instance().flush_delayed_groups()
+
+    @staticmethod
+    def disable_offload():
+        """Disable the offload."""
+        PipelineOffloadManager.get_instance().disable_offload()
+
+    @staticmethod
+    def enable_offload():
+        """Enable the offload."""
+        PipelineOffloadManager.get_instance().enable_offload()
+
+    @staticmethod
+    def enter_replay():
+        """Enter CUDA graph replay mode to enable delayed offloading."""
+        PipelineOffloadManager.get_instance()._in_replay = True
+
+    @staticmethod
+    def exit_replay():
+        """Exit CUDA graph replay mode."""
+        PipelineOffloadManager.get_instance()._in_replay = False
diff --git a/megatron/core/pipeline_parallel/hybrid_cp_schedule.py b/megatron/core/pipeline_parallel/hybrid_cp_schedule.py
deleted file mode 100644
index 27b5fc87945..00000000000
--- a/megatron/core/pipeline_parallel/hybrid_cp_schedule.py
+++ /dev/null
@@ -1,660 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-
-from collections import deque
-from functools import lru_cache
-from math import ceil, log2
-from typing import Callable, List, Optional, Tuple
-
-import torch
-
-from megatron.core import parallel_state
-from megatron.core.rerun_state_machine import RerunDataIterator
-
-
-class BalancedCPScheduler:
-    """
-    This class provides the functionality to form groups of sub-samples
-    such that all DPxCP ranks have a roughly balanced workload in the group.
-    """
-
-    def __init__(self, max_seq_len_per_rank: int, dp_cp_group: torch.distributed.ProcessGroup):
-        self.max_seq_len_per_rank = max_seq_len_per_rank
-        self.num_subsamples = 0
-        self.num_subsamples_processed = 0
-        self.free_resources = []
-        self.total_hdp_gpus = dp_cp_group.size()
-
-    @lru_cache(maxsize=128)
-    def get_total_workload(self, seq_length: int, cp_size: Optional[int] = None):
-        """
-        seq_length: sequence length of a sub-sample
-        cp_size: total number of CP ranks working on this sub-sample
-
-        Note:
-        This function is used to estimate the relative workload intensity
-        of a sub-sample. This is not meant to be an accurate flops calculator.
-
-        Returns: workload of a sub-sample
-        """
-        if cp_size is None:
-            cp_size = self.gpus_needed(seq_length)
-        return (seq_length * seq_length) / cp_size
-
-    @lru_cache(maxsize=128)
-    def gpus_needed(self, seq_len: int) -> int:
-        """
-        Calculates the number of GPUs needed for a given sequence length
-        and max sequence length per CP rank.
-        This is used to determine the CP size of a sub-sample.
-
-        The number is rounded up to the next power of 2 to match the available
-        hybrid context parallel process group sizes.
-        """
-        return max(1, 2 ** ceil(log2((seq_len / self.max_seq_len_per_rank))))
-
-    def make_buckets_equal(
-        self,
-        sample_seqlens: List[Tuple[int, int]],  # List of (sample_id, sequence_length) tuples
-        compute_estimator: Callable[[int], float],
-    ) -> List[deque]:
-        """
-        Makes as many buckets as unique CP sizes needed.
-        This keeps sample IDs tethered to their sequence lengths throughout the bucketing process.
-        """
-        # Extract just the sequence lengths for determining k
-        seqlens = [seq_len for _, seq_len in sample_seqlens]
-
-        # Determine k based on unique GPU categories needed
-        k = len({self.gpus_needed(L) for L in seqlens})
-
-        # Create a work target for each bucket
-        # This is the total work divided by the number of buckets
-        work = []
-        for _, s in sample_seqlens:
-            cp_size = self.gpus_needed(s)
-            work.append(compute_estimator(s, cp_size))
-        total_work = sum(work)
-        target = total_work / k
-        buckets, cur, cur_work = [], [], 0.0
-        remaining_work = total_work
-        remaining_k = k
-
-        for i, (sample_id, seq_len) in enumerate(sample_seqlens):
-            work = compute_estimator(seq_len)
-            projected = cur_work + work
-
-            # Check if we should close this bucket
-            if cur and (
-                projected > target * 1.1  # Too much work
-                or len(sample_seqlens) - i <= remaining_k - len(buckets)
-            ):  # Need to save sequences for remaining buckets
-                buckets.append(deque(cur))
-                cur, cur_work = [], 0.0
-                remaining_work -= sum(compute_estimator(seq_len) for _, seq_len in cur)
-                remaining_k -= 1
-
-            cur.append((sample_id, seq_len))
-            cur_work += work
-
-        if cur:
-            buckets.append(deque(cur))
-
-        return buckets
-
-    def next_hdp_group(
-        self,
-        sample_seqlens: List[Tuple[int, int]],  # List of (sample_id, sequence_length) tuples
-        compute_estimator: Callable[[int], float],
-        total_gpus: int,
-        delta: float = 0.05,  # balance slack (e.g. 5 %)
-        strategy: str = "dp",  # "dp" or "pp"
-        eps_bucket: float = 0.10,  # ε target for bucket balance
-    ) -> Tuple[List[List[int]], List[Tuple[int, int]], List[float], List[List[int]]]:
-        """
-        Given a list of (sample_id, sequence_length) tuples, this function aims to assign
-        sequences in a group such that all GPUs in the DPxCP group have a roughly balanced
-        workload. Once each group is roughly balanced, we exit and return the
-        group and the leftover sequences.
-
-        The function performs the following passes in order to form a balanced microbatch:
-        1. We create buckets of sequences that are roughly balanced.
-        We try to create as many buckets as possible CP sizes.
-        2. Given a bucket has sequences available, we assign the sample
-            a. To a new set of GPUs if there are enough free GPUs.
-            b. To an existing set of GPUs with the lowest load.
-        3. We check if the group is balanced whenever we need to move onto a new CP size
-        in the same set of GPUs.
-        4. We trim the group if removing the last added sequence helps improve balance.
-        5. If we run out of sequences to assign and there are empty GPUs,
-        we redistribute work to empty GPUs by recursively increasing the CP size of a
-        sample until no empty GPUs are left.
-
-        Returns (micro_batches, leftover_sample_seqlens, exec_times, sample_ids_per_gpu).
-        """
-        if not sample_seqlens:
-            return (
-                [[] for _ in range(total_gpus)],
-                [],
-                [0.0 for _ in range(total_gpus)],
-                [[] for _ in range(total_gpus)],
-            )
-
-        # Get buckets of sequences with balanced work
-        buckets = self.make_buckets_equal(sample_seqlens, compute_estimator)
-
-        # Initialize tracking structures
-        micro_batches = [[] for _ in range(total_gpus)]
-        exec_times = [0.0 for _ in range(total_gpus)]
-        sample_ids_per_gpu = [[] for _ in range(total_gpus)]
-
-        gpu_group_id = [None] * total_gpus
-        group_members = {}
-        group_size = {}
-        next_gid = 0
-
-        pp_cursor = 0
-        prev_needed = None
-        check_balance = False
-
-        while buckets:
-            # ---- Step 1 – pick the next sequence we COULD place ------------------
-            sample_seq_tuple = bucket_idx = None
-            needed = None
-
-            scan_order = (
-                range(len(buckets))
-                if strategy == "dp"
-                else [(pp_cursor + i) % len(buckets) for i in range(len(buckets))]
-            )
-
-            for idx in scan_order:
-                if not buckets[idx]:
-                    continue
-                cand_tuple = buckets[idx][0]  # This is now (sample_id, seq_len)
-                cand_seq_len = cand_tuple[1]
-                needed = self.gpus_needed(cand_seq_len)
-
-                # (a) Do we have an *existing* group of size `needed`?
-                candidate_gids = [gid for gid, sz in group_size.items() if sz == needed]
-
-                # (b) Or enough completely free GPUs to start a new group?
-                free_ranks = [r for r, gid in enumerate(gpu_group_id) if gid is None]
-                if candidate_gids or len(free_ranks) >= needed:
-                    sample_seq_tuple, bucket_idx = cand_tuple, idx
-                    break
-
-            # No place to put any remaining sequence – finish this micro‑batch
-            if sample_seq_tuple is None:
-                break
-
-            # TODO[pmannan]: PP not yet supported. Add PP scheduling.
-            if strategy == "pp":
-                pp_cursor = (bucket_idx + 1) % len(buckets)
-
-            sample_id, seq_len = sample_seq_tuple
-            needed = self.gpus_needed(seq_len)
-            if prev_needed is None:
-                prev_needed = needed
-
-            # (a)  Existing groups of exactly this size
-            candidate_gids = [gid for gid, sz in group_size.items() if sz == needed]
-            if candidate_gids:
-                best_gid, best_load = min(
-                    (
-                        (gid, max(exec_times[r] for r in group_members[gid]))
-                        for gid in candidate_gids
-                    ),
-                    key=lambda t: t[1],
-                )
-            else:
-                best_gid, best_load = None, float("inf")
-
-            # (b)  Hypothetical **new** group from completely free GPUs
-            free_ranks = [r for r, gid in enumerate(gpu_group_id) if gid is None]
-            if len(free_ranks) >= needed:
-                free_sorted = sorted(free_ranks, key=lambda r: exec_times[r])
-                new_members = free_sorted[:needed]
-                new_load = exec_times[new_members[-1]]
-
-                if new_load < best_load:
-                    best_gid = None
-                    chosen_members = new_members
-                else:
-                    chosen_members = group_members[best_gid]
-            else:
-                chosen_members = group_members[best_gid]
-
-            # ---- Step 2 – if we decided to create a fresh group ----------------
-            if best_gid is None:
-                best_gid = next_gid
-                next_gid += 1
-                group_members[best_gid] = chosen_members
-                group_size[best_gid] = needed
-                for r in chosen_members:
-                    gpu_group_id[r] = best_gid
-
-            # ---- Step 3 – assign the sequence to every member of that group ------
-            per_gpu_cost = compute_estimator(seq_len)
-
-            for r in chosen_members:
-                micro_batches[r].append(seq_len)
-                exec_times[r] += per_gpu_cost
-                sample_ids_per_gpu[r].append(sample_id)
-
-            # Remove the sequence definitively from its bucket
-            buckets[bucket_idx].popleft()
-
-            # ---- Step 4 – tidy, balance‑check, maybe early‑exit ------------------
-            while buckets and not buckets[0]:
-                buckets.pop(0)
-                pp_cursor %= max(1, len(buckets))
-
-            # TODO: Removing this helps reduce the number of groups when we have
-            # lots of samples with same CP size.
-            # But because we don't exit as soon as we get balanced,
-            # even if there is one group available that can take the next sample,
-            # we will keep adding samples to the same group.
-            # trim_overload() does not help because it only checks if removing the
-            # last added sample helps.
-            # We cannot check after adding every sample because there will always be imbalance
-            # if we don't wait for future scheduling.
-
-            # IMPORTANT: So we need a solution here
-            if needed < prev_needed:
-                # When we get into a lower CP size in the same group,
-                # we can start checking for balance. There is still a gotcha here.
-                # Let's say we have a group of 3 GPU 0-2, then we move onto group of 2.
-                # We keep assigning group of 2 as we do in descending order but GPU 7/15
-                # never sees a microbatch assigned to it
-                # until we run out of samples with CP2.
-                # This means we are never balanced as min(exec_times) will always be 0.
-                # We need a smart way of identifying that we have run out of big samples
-                # and if we are having to assign work to a GPU already working,
-                # is it because there are empty GPUs?
-                # Would assigning work to empty GPUs first by moving onto next CP bucket help?
-                # But we need to remember to come back to this CP size bucket and then
-                # check for balance. Maybe the scheduling algorithm should look at empty
-                # GPUs and find work rather than going sequence by sequence.
-                check_balance = True
-
-            if (
-                check_balance
-                and buckets
-                and max(exec_times) - min(exec_times) <= delta * max(exec_times)
-            ):
-                break
-
-        # Gather leftovers (flatten remaining buckets, preserve order)
-        leftovers = []
-        for b in buckets:
-            for sample_seq_tuple in b:
-                leftovers.append(sample_seq_tuple)
-
-        # ---------------------------------------------------------------------------
-        def trim_overload():
-            """
-            Iteratively pop the most‑recent sequence from the *most‑loaded group*
-            whenever doing so reduces the global slack.
-            """
-            while True:
-                cur_max = max(exec_times)
-                cur_min = min(exec_times)
-                cur_slack = cur_max - cur_min
-                if cur_slack <= delta * cur_max:
-                    # Slack is already within limit.
-                    break
-                if cur_min == 0:
-                    # There are empty GPUs that will be
-                    # handled in the next step.
-                    break
-
-                max_r = exec_times.index(cur_max)
-                gid = gpu_group_id[max_r]
-                members = group_members[gid]
-
-                if not micro_batches[max_r] or len(micro_batches[max_r]) <= 1:
-                    break
-
-                seq = micro_batches[max_r][-1]
-                need = group_size[gid]
-                per_gpu_cost = compute_estimator(seq)
-
-                proj_times = exec_times[:]
-                for r in members:
-                    proj_times[r] -= per_gpu_cost
-
-                proj_slack = max(proj_times) - min(proj_times)
-
-                # Check if trimming the workload helps imbalance
-                if proj_slack < cur_slack:
-                    sample_id_to_remove = sample_ids_per_gpu[max_r][-1]
-                    for r in members:
-                        micro_batches[r].pop()
-                        exec_times[r] -= per_gpu_cost
-                        sample_ids_per_gpu[r].pop()
-                    leftovers.append((sample_id_to_remove, seq))
-                else:
-                    break
-
-        trim_overload()
-
-        # Track samples in this group before redistribution to empty GPUs
-        total_work_before = sum(len(mb) for mb in micro_batches)
-
-        # Check for empty GPUs and redistribute work
-        def fill_empty_gpus(
-            micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size
-        ):
-            """
-            Recursively check for empty GPUs and redistribute work by increasing
-            the number of GPUs sharing samples. This ensures all GPUs have work.
-            GPUs must be allocated consecutively so we may need to push existing
-            work to other ranks in order to expand samples.
-            """
-            # Find empty GPUs
-            empty_gpus = [i for i in range(total_gpus) if not micro_batches[i]]
-            if not empty_gpus:
-                return (
-                    micro_batches,
-                    exec_times,
-                    sample_ids_per_gpu,
-                    group_members,
-                    group_size,
-                )  # No empty GPUs, we're done
-
-            # Find the smallest group size that exists
-            existing_group_sizes = set(group_size.values())
-            assert (
-                existing_group_sizes
-            ), "There should be at least one group existing, cannot reditribute, "
-            "try to increase 'max-seqlen-per-cp-rank'."
-
-            min_group_size = min(existing_group_sizes)
-            # We have Hybrid DPxCP groups for every power of 2 of GPUs or the entire DPxCP group.
-            next_power = min(min_group_size * 2, total_gpus)
-
-            # Find the first group of min_group_size that can be expanded
-            expandable_gid = None
-            expandable_members = None
-            expandable_new_gpus = None
-
-            for gid, size in group_size.items():
-                if size == min_group_size:
-                    members = group_members[gid]
-                    needed_count = next_power - min_group_size
-                    group_start_gpu = members[0]
-                    group_end_gpu = members[-1]
-                    empty_gpu = [idx for idx, work in enumerate(micro_batches) if not work][0]
-                    assert not all(
-                        work for work in micro_batches[empty_gpu : empty_gpu + needed_count]
-                    ), f"Empty GPUs were detected but not enough to expand."
-                    work_to_push = micro_batches[
-                        group_end_gpu + 1 : empty_gpu
-                    ]  # This is work of all other subsequent sub-samples
-                    exec_times_to_push = exec_times[group_end_gpu + 1 : empty_gpu]
-                    sample_ids_to_push = sample_ids_per_gpu[group_end_gpu + 1 : empty_gpu]
-
-                    new_micro_batches = [[]] * len(micro_batches)
-                    new_exec_times = [0.0] * len(exec_times)
-                    new_sample_ids_per_gpu = [[]] * len(sample_ids_per_gpu)
-
-                    # No change in work until the group selected for expansion
-                    for i in range(group_start_gpu):
-                        new_micro_batches[i] = micro_batches[i]
-                        new_exec_times[i] = exec_times[i]
-                        new_sample_ids_per_gpu[i] = sample_ids_per_gpu[i]
-
-                    # The work is distributed across the expanded group
-                    for i in range(group_start_gpu, group_end_gpu + needed_count + 1):
-                        new_micro_batches[i] = micro_batches[group_end_gpu]
-                        new_exec_times[i] = self.get_total_workload(
-                            micro_batches[group_end_gpu][0], next_power
-                        )
-                        new_sample_ids_per_gpu[i] = sample_ids_per_gpu[group_end_gpu]
-
-                    # Any assigned work on expanded GPUs is pushed
-                    for i, work in enumerate(work_to_push):
-                        new_micro_batches[group_end_gpu + needed_count + 1 + i] = work
-                        new_exec_times[group_end_gpu + needed_count + 1 + i] = exec_times_to_push[i]
-                        new_sample_ids_per_gpu[group_end_gpu + needed_count + 1 + i] = (
-                            sample_ids_to_push[i]
-                        )
-
-                    group_size[gid] = next_power
-                    group_members[gid] = list(range(members[0], members[-1] + needed_count + 1))
-                    for pushed_gid in group_size.keys():
-                        if pushed_gid > gid:
-                            group_members[pushed_gid] = [
-                                x + needed_count for x in group_members[pushed_gid]
-                            ]
-
-                    return (
-                        new_micro_batches,
-                        new_exec_times,
-                        new_sample_ids_per_gpu,
-                        group_members,
-                        group_size,
-                    )
-
-        empty_gpus = any([not micro_batches[i] for i in range(total_gpus)])
-        while empty_gpus:
-            micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size = (
-                fill_empty_gpus(
-                    micro_batches, exec_times, sample_ids_per_gpu, group_members, group_size
-                )
-            )
-            empty_gpus = any([not micro_batches[i] for i in range(total_gpus)])
-
-        # Assert that no sample has been completely removed
-        total_work_after = sum(len(mb) for mb in micro_batches)
-        assert (
-            total_work_after >= total_work_before
-        ), f"Samples were removed: {total_work_before} -> {total_work_after}"
-
-        return micro_batches, leftovers, exec_times, sample_ids_per_gpu
-
-    def get_groups_and_subsamples(self, sample_id_seqlens, config):
-        """
-        This function recursively forms groups of sub-samples such that all DPxCP ranks
-        have a roughly balanced workload in the group.
-        """
-        groups = []
-        sample_id_groups = []
-        # We assign a sample_id to each sub-sample in order to track assignment to each GPU.
-        sample_id_seqlens = sorted(sample_id_seqlens, key=lambda x: x[1], reverse=True)
-        while sample_id_seqlens:
-            mb, sample_id_seqlens, exec_times, sample_ids = self.next_hdp_group(
-                sample_id_seqlens, self.get_total_workload, self.total_hdp_gpus
-            )
-            groups.append(mb)
-            if len(sample_ids) < self.total_hdp_gpus:
-                sample_ids.extend([] * (self.total_hdp_gpus - len(sample_ids)))
-            sample_id_groups.append(sample_ids)
-
-        return groups, sample_id_groups
-
-
-def hybrid_context_parallel_forward_backward(
-    forward_step_func,
-    data_iterator,
-    model,
-    num_microbatches,
-    input_tensor,
-    output_tensor_grad,
-    forward_data_store,
-    config,
-    collect_non_loss_data,
-    first_val_step,
-    forward_only,
-    no_sync_func,
-    total_num_tokens,
-    check_first_val_step,
-    model_type,
-):
-    """
-    Scheduler for Hybrid Context Parallel.
-
-    This function performs the packed sample scheduling and determines
-    1. The number of microbatches to schedule for each CP rank
-    2. The number of groups each CP rank should execute
-    3. The number of sub-samples per group each CP rank should execute
-
-    A group is defined by a set of samples that can run across the CP domain without any barrier.
-    There are many reasons why we may not be able to run endless samples within a single group.
-    For example, if we have 8 GPUs,
-    if GPU 0-5 are assigned a long sample that requires CP6,
-    GPU 6-7 are assigned a short sample that requires CP2,
-    The next sample which requires CP4 can be assigned GPU 4-7.
-    But GPU 6-7 will finish first and get deadlocked if GPU 4-5 are not participating in the group.
-    """
-    from .schedules import backward_step, forward_step
-
-    def _broadcast(item):
-        if item is not None:
-            torch.distributed.broadcast(
-                item,
-                parallel_state.get_tensor_model_parallel_src_rank(),
-                group=parallel_state.get_tensor_model_parallel_group(),
-            )
-
-    def _broadcast_num_samples_this_group(num_samples_this_group):
-        dev = torch.cuda.current_device()
-        torch.distributed.barrier()
-
-        n = 0 if num_samples_this_group is None else int(num_samples_this_group.numel())
-        n = torch.tensor([n], dtype=torch.int64, device=dev)
-
-        _broadcast(n)
-        n = int(n.item())
-
-        assert n > 0, "there should be at least 1 sub samples in the group"
-        num_samples_this_group_broadcast = (
-            torch.empty(n, dtype=torch.int32, device=dev)
-            if num_samples_this_group is None
-            else num_samples_this_group
-        )
-        _broadcast(num_samples_this_group_broadcast)
-        return num_samples_this_group_broadcast
-
-    def _get_new_data_iterator(sample_id_in_group, group_id):
-        if is_first_tp_rank:
-            sub_sample_id = sample_ids_this_group[sample_id_in_group]
-            sample = batch[sub_sample_id]
-            partner_cp_size = len(
-                [True for sample_ids in sample_id_groups[group_id] if sub_sample_id in sample_ids]
-            )
-            sample["local_cp_size"] = torch.tensor(partner_cp_size, dtype=torch.int32)
-            new_data_iterator = RerunDataIterator(iter([sample]))
-            return new_data_iterator
-        else:
-            return None
-
-    # We get data once per global batch and schedule the sub-samples.
-    # TODO(pmannan): Should we wrap the data_iterator here instead of the training.py file?
-    hdp_rank = parallel_state.get_data_parallel_rank(with_context_parallel=True)
-    is_first_tp_rank = parallel_state.get_tensor_model_parallel_rank() == 0
-
-    if is_first_tp_rank:
-        data = next(data_iterator)
-        sample_id_groups = data[1]
-        batch = data[0]
-    else:
-        data, sample_id_groups, batch = None, None, None
-
-    num_samples_this_group = None
-    if is_first_tp_rank:
-        num_samples_this_group = torch.tensor(
-            [len(group[hdp_rank]) for group in sample_id_groups], dtype=torch.int32, device='cuda'
-        )
-
-    num_samples_this_group = _broadcast_num_samples_this_group(num_samples_this_group)
-    num_samples_this_group = num_samples_this_group.cpu().numpy()
-    num_total_groups = num_samples_this_group.shape[0]
-
-    current_microbatch = 0
-
-    # Upto last group, we don't need any sync.
-    with no_sync_func():
-        for j in range(num_total_groups - 1):
-            sample_ids_this_group = sample_id_groups[j][hdp_rank] if is_first_tp_rank else None
-            for i in range(num_samples_this_group[j]):
-                # Call forward step for each sub-sample
-                new_data_iterator = _get_new_data_iterator(i, j)
-                # TODO: Find the usage of current_microbatch and is_first_microbatch and
-                # how that may affect my usage.
-                output_tensor, num_tokens = forward_step(
-                    forward_step_func,
-                    new_data_iterator,
-                    model,
-                    num_microbatches,
-                    input_tensor,
-                    forward_data_store,
-                    config,
-                    collect_non_loss_data,
-                    is_first_microbatch=check_first_val_step(
-                        first_val_step, forward_only, current_microbatch == 0
-                    ),
-                    current_microbatch=current_microbatch,
-                )
-                current_microbatch += 1
-                total_num_tokens += num_tokens.item()
-                if not forward_only:
-                    backward_step(
-                        input_tensor, output_tensor, output_tensor_grad, model_type, config
-                    )
-
-            # Create a barrier at end of each group.
-            # This barrier ensures that all ranks are prepared to change assigned CP group sizes and
-            # no rank is starting a sub-sample ahead of it's partner ranks.
-            torch.distributed.barrier(
-                parallel_state.get_data_parallel_group(with_context_parallel=True)
-            )
-
-    # For the last group, we need to run the last sub-sample out of the context handler.
-    with no_sync_func():
-        sample_ids_this_group = sample_id_groups[-1][hdp_rank] if is_first_tp_rank else None
-        for i in range(num_samples_this_group[-1] - 1):
-            new_data_iterator = _get_new_data_iterator(i, -1)
-            # Call forward step for each sub-sample
-            output_tensor, num_tokens = forward_step(
-                forward_step_func,
-                new_data_iterator,
-                model,
-                num_microbatches,
-                input_tensor,
-                forward_data_store,
-                config,
-                collect_non_loss_data,
-                is_first_microbatch=check_first_val_step(
-                    first_val_step, forward_only, current_microbatch == 0
-                ),
-                current_microbatch=current_microbatch,
-            )
-            current_microbatch += 1
-            total_num_tokens += num_tokens.item()
-            if not forward_only:
-                backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)
-
-    # The last sub-sample of the last group of the last microbatch is
-    # run out of the context handler.
-    new_data_iterator = _get_new_data_iterator(-1, -1)
-    # Call forward step for each sub-sample
-    output_tensor, num_tokens = forward_step(
-        forward_step_func,
-        new_data_iterator,
-        model,
-        num_microbatches,
-        input_tensor,
-        forward_data_store,
-        config,
-        collect_non_loss_data,
-        is_first_microbatch=check_first_val_step(
-            first_val_step, forward_only, current_microbatch == 0
-        ),
-        current_microbatch=current_microbatch,
-    )
-    total_num_tokens += num_tokens.item()
-    if not forward_only:
-        backward_step(input_tensor, output_tensor, output_tensor_grad, model_type, config)
-
-    return forward_data_store, total_num_tokens
diff --git a/megatron/core/pipeline_parallel/schedules.py b/megatron/core/pipeline_parallel/schedules.py
index 14fc6041574..faeba1aeb7d 100644
--- a/megatron/core/pipeline_parallel/schedules.py
+++ b/megatron/core/pipeline_parallel/schedules.py
@@ -1,7 +1,8 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import contextlib
 from functools import partial
+from itertools import zip_longest
 from typing import Callable, Dict, Iterator, List, Optional, Union
 
 import torch
@@ -25,6 +26,7 @@
 )
 from megatron.core.transformer.cuda_graphs import create_cudagraphs, set_current_microbatch
 from megatron.core.transformer.enums import CudaGraphScope
+from megatron.core.transformer.moe.paged_stash import paged_stash_reset
 from megatron.core.transformer.moe.router import MoEAuxLossAutoScaler
 from megatron.core.utils import (
     drain_embedding_wgrad_compute,
@@ -39,7 +41,6 @@
     combined_1f1b_schedule_for_interleaved_pipelining,
     combined_1f1b_schedule_for_no_pipelining,
 )
-from .hybrid_cp_schedule import hybrid_context_parallel_forward_backward
 
 # Types
 Shape = Union[List[int], torch.Size]
@@ -301,6 +302,8 @@ def forward_step_calc_loss(
         if config.calculate_per_token_loss:
             MoEAuxLossAutoScaler.set_loss_scale(loss_scale)
         else:
+            # See https://github.com/NVIDIA/Megatron-LM/pull/2217 for detailed explanation
+            # of scaling by cp_group_size
             cp_size_for_scaling = cp_group_size if cp_group_size is not None else 1
             MoEAuxLossAutoScaler.set_loss_scale(loss_scale * cp_size_for_scaling / num_microbatches)
 
@@ -644,6 +647,9 @@ def forward_backward_no_pipelining(
     if config.timers is not None:
         config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)
 
+    if getattr(config, "moe_paged_stash", False):
+        paged_stash_reset(enabled=not forward_only, config=config)
+
     no_sync_func = config.no_sync_func
     if no_sync_func is None:
         no_sync_func = contextlib.nullcontext
@@ -671,24 +677,6 @@ def forward_backward_no_pipelining(
             total_num_tokens,
             partial(check_first_val_step, first_val_step, forward_only),
         )
-    elif config.hybrid_context_parallel:
-        forward_data_store, total_num_tokens = hybrid_context_parallel_forward_backward(
-            forward_step_func,
-            data_iterator,
-            model,
-            num_microbatches,
-            input_tensor,
-            output_tensor_grad,
-            forward_data_store,
-            config,
-            collect_non_loss_data,
-            first_val_step,
-            forward_only,
-            no_sync_func,
-            total_num_tokens,
-            check_first_val_step,
-            model_type,
-        )
     else:
         with no_sync_func():
             for i in range(num_microbatches - 1):
@@ -891,6 +879,136 @@ def get_schedule_table(num_microbatches, num_model_chunks, microbatch_group_size
     return schedule_table
 
 
+def convert_schedule_table_to_order(num_warmup_microbatches, num_model_chunks, schedule_table):
+    """Convert a tunable schedule lookup table to the te.make_graphed_callables() accepted
+    order format. For example, the tunable schedule table for PP2 N3M5 with VP2 is as below:
+    virtual_microbatch_id | 0 1 2 3 4 5 6 7 8 9
+    microbatch_id         | 0 1 2 0 1 2 3 4 3 4
+    model_chunk_id        | 0 0 0 1 1 1 0 0 1 1
+
+    Then the forward backward separated order is:
+    forward               | 1 1 1 2 2 2 1 1 2 2
+    backward              | -2 -2 -2 -1 -1 -1 -2 -2 -1 -1
+
+    If num_warmup_microbatches is 5, the output order is:
+    1 1 1 2 2 2 -2 1 -2 1 -2 2 -1 2 -1 -1 -2 -2 -1 -1
+    """
+    _, model_chunk_id_table = zip(*schedule_table)
+    forward_order = [chunk_id + 1 for chunk_id in model_chunk_id_table]
+    backward_order = [chunk_id - num_model_chunks for chunk_id in model_chunk_id_table]
+    order = forward_order[:num_warmup_microbatches]
+    for i in range(num_warmup_microbatches, len(forward_order)):
+        order.append(forward_order[i])
+        order.append(backward_order[i - num_warmup_microbatches])
+    if num_warmup_microbatches > 0:
+        order.extend(backward_order[-num_warmup_microbatches:])
+    return order
+
+
+def get_overlap_moe_expert_parallel_comm_order(order, num_layers_per_chunk, capture_wgrad_graph):
+    """
+    This functions gets the order for overlap_moe_expert_parallel_comm schedule for the original
+    chunk-wise order list. Each chunk is transformered to chunks with only 1 layer so that
+    layers between 2 chunks can now overlap with each other while following the graph order.
+    If capture_wgrad_graph is True, the wgrad backward graph is also added to the order by
+    decreasing the layer id by 0.5.
+
+    Args:
+        order (List[int]): The original chunk-wise order list. Positive values represent forward
+            passes for chunks, negative values represent backward passes. The absolute value
+            indicates the chunk ID (1-indexed).
+        num_layers_per_chunk (List[int]): Number of graphable layers in each chunk. The length
+            of this list equals the number of chunks.
+        capture_wgrad_graph (bool): If True, weight gradient computation graphs are added to the
+            order by appending entries with layer_id - 0.5.
+
+    Returns:
+        Tuple[List[float], List[Optional[List[int]]]]: A tuple containing:
+            - new_order: The layer-wise order list where each chunk is expanded to individual
+              layers. Positive values are forward passes, negative values are backward passes.
+              Values with .5 suffix indicate weight gradient computations.
+            - chunk_id_list: A list parallel to new_order. For forward passes, contains
+              [chunk_id, layer_index_within_chunk]. For backward passes, contains None.
+
+    Example:
+        original_order: [1, 2, -2, 1, -1, -1]
+        num_layers_per_chunk: [1, 2]
+        capture_wgrad_graph=True:
+            new_order: [1, 2, 3, 1, -3, -3.5, -2, -2.5, -1, -1.5, -1, -1.5]
+            chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None,
+                            None, None, None, None, None, None, None]
+        capture_wgrad_graph=False:
+            new_order: [1, 2, 3, 1, -3, -2, -1, -1]
+            chunk_id_list: [[0, 0], [1, 0], [1, 1], [0, 0], None, None, None, None]
+    """
+
+    def _add_order(new_order, chunk_id_list, c_id, layer_id, is_wgrad=False, index=None):
+        if is_wgrad:
+            new_order.append(layer_id - 0.5)
+        else:
+            new_order.append(layer_id)
+        if c_id > 0:
+            chunk_id_list.append([abs(c_id) - 1, index])
+        else:
+            chunk_id_list.append(None)
+
+    new_order = []
+    chunk_id_list = []
+    add_order = partial(_add_order, new_order, chunk_id_list)
+    first_backward_idx, last_forward_idx = None, None
+    for idx, c_id in enumerate(order):
+        if first_backward_idx is None and c_id < 0:
+            first_backward_idx = idx
+        if c_id > 0:
+            last_forward_idx = idx
+
+    def get_layer_range(c_id):
+        num_layers = num_layers_per_chunk[abs(c_id) - 1]
+        num_layers_previous_chunks = sum(num_layers_per_chunk[: abs(c_id) - 1])
+        if c_id > 0:
+            return list(
+                range(num_layers_previous_chunks + 1, num_layers_previous_chunks + num_layers + 1)
+            )
+        return list(range(-num_layers_previous_chunks - num_layers, -num_layers_previous_chunks))
+
+    # warmup stage
+    for c_id in order[:first_backward_idx]:
+        layer_range = get_layer_range(c_id)
+        new_order += layer_range
+        chunk_id_list.extend([abs(c_id) - 1, i] for i in range(len(layer_range)))
+
+    # 1f1b overlap stage
+    if first_backward_idx < last_forward_idx:
+        for c_id_b, c_id_f in zip(
+            order[first_backward_idx : last_forward_idx + 1 : 2],
+            order[first_backward_idx + 1 : last_forward_idx + 1 : 2],
+        ):
+            layer_range_f = get_layer_range(c_id_f)
+            layer_range_b = get_layer_range(c_id_b)
+            index = 0
+            for l_b, l_f in zip_longest(layer_range_b, layer_range_f, fillvalue=0):
+                # always forward graph before backward graph
+                if l_f != 0:
+                    add_order(c_id_f, l_f, index=index)
+                if l_b != 0:
+                    add_order(c_id_b, l_b)
+                    if capture_wgrad_graph and index < len(layer_range_b) - 1:
+                        add_order(c_id_b, l_b, is_wgrad=True)
+                index += 1
+            # last wgrad backward
+            if capture_wgrad_graph and layer_range_b:
+                add_order(c_id_b, layer_range_b[-1], is_wgrad=True)
+
+    # cool down stage, backward graphs only
+    for c_id in order[last_forward_idx + 1 :]:
+        for l_b in get_layer_range(c_id):
+            add_order(c_id, l_b)
+            if capture_wgrad_graph:
+                add_order(c_id, l_b, is_wgrad=True)
+
+    return new_order, chunk_id_list
+
+
 def forward_backward_pipelining_with_interleaving(
     *,
     forward_step_func,
@@ -967,6 +1085,9 @@ def forward_backward_pipelining_with_interleaving(
         adjust_tensor_shapes_fn is None
     ), "adjust_tensor_shapes_fn is not supported for interleaved pipeline parallelism"
 
+    if getattr(config, "moe_paged_stash", False):
+        paged_stash_reset(enabled=not forward_only, config=config)
+
     if config.overlap_p2p_comm and config.batch_p2p_comm:
         raise ValueError("Can not use both overlap_p2p_comm and batch_p2p_comm")
 
@@ -1065,7 +1186,15 @@ def enable_grad_sync():
 
     model_type = get_model_type(model[0])
 
-    tensor_shape = [seq_length, micro_batch_size, config.hidden_size]
+    # Determine hidden dimension for P2P communication
+    # For hyper connections with multiple PP stages, use n-stream dimension
+    hidden_dim = config.hidden_size
+    if getattr(config, 'enable_hyper_connections', False) and pipeline_parallel_size > 1:
+        # For interleaved PP with hyper connections, all intermediate communications use n-stream
+        # Note: This is a simplified approach - proper VPP support may need more complex logic
+        hidden_dim = config.hidden_size * getattr(config, 'num_residual_streams', 1)
+
+    tensor_shape = [seq_length, micro_batch_size, hidden_dim]
     tensor_shape[0] = tensor_shape[0] // cp_group.size()
     if config.sequence_parallel:
         tensor_shape[0] = tensor_shape[0] // tp_group.size()
@@ -2008,9 +2137,19 @@ def get_tensor_shapes(
     config,
     tp_group: Optional[torch.distributed.ProcessGroup] = None,
     cp_group: Optional[torch.distributed.ProcessGroup] = None,
+    pp_group: Optional[torch.distributed.ProcessGroup] = None,
+    is_recv: bool = True,
 ):
     """Determine tensor shapes for pipeline communication.
 
+    For hyper connections (mHC), intermediate pipeline stages communicate n-stream tensors
+    with dimension hidden_size * num_residual_streams.
+
+    Args:
+        is_recv: If True, compute shape for receiving; if False, for sending.
+                 This matters for hyper connections where first/last stages have different
+                 send/recv dimensions.
+
     Returns [()] for variable_seq_lengths mode (shapes exchanged dynamically),
     or computed shapes for fixed sequence length mode.
     """
@@ -2028,7 +2167,27 @@ def get_tensor_shapes(
     if config.sequence_parallel:
         effective_seq_length = effective_seq_length // tp_group.size()
 
-    tensor_shapes.append((effective_seq_length, micro_batch_size, config.hidden_size))
+    # Determine hidden dimension based on hyper connections and pipeline stage
+    hidden_size = config.hidden_size
+    # TODO: make this more robust, including flexible VPP layout
+    if getattr(config, 'enable_hyper_connections', False) and pp_group is not None:
+        pp_rank = pp_group.rank()
+        pp_size = pp_group.size()
+        # For hyper connections:
+        # - recv: stages with rank > 0 receive n-stream (n*C) from previous stage
+        # - send: stages with rank < pp_size-1 send n-stream (n*C) to next stage
+        use_nstream = False
+        if is_recv and pp_rank > 0:
+            # Receiving from previous stage (which sends n*C)
+            use_nstream = True
+        elif not is_recv and pp_rank < pp_size - 1:
+            # Sending to next stage (send n*C)
+            use_nstream = True
+
+        if use_nstream:
+            hidden_size = hidden_size * getattr(config, 'num_residual_streams', 1)
+
+    tensor_shapes.append((effective_seq_length, micro_batch_size, hidden_size))
     return tensor_shapes
 
 
@@ -2141,6 +2300,9 @@ def forward_backward_pipelining_without_interleaving(
     if config.timers is not None:
         config.timers('forward-backward', log_level=1).start(barrier=config.barrier_with_L1_time)
 
+    if getattr(config, "moe_paged_stash", False):
+        paged_stash_reset(enabled=not forward_only, config=config)
+
     # Disable async grad reductions
     no_sync_func = config.no_sync_func
     if no_sync_func is None:
@@ -2196,6 +2358,8 @@ def enable_grad_sync():
         config=config,
         tp_group=tp_group,
         cp_group=cp_group,
+        pp_group=getattr(p2p_communicator, "pp_group", None),
+        is_recv=True,
     )
     send_tensor_shapes = get_tensor_shapes(
         seq_length=seq_length,
@@ -2204,6 +2368,8 @@ def enable_grad_sync():
         config=config,
         tp_group=tp_group,
         cp_group=cp_group,
+        pp_group=getattr(p2p_communicator, "pp_group", None),
+        is_recv=False,
     )
     if adjust_tensor_shapes_fn is not None:
         recv_tensor_shapes, send_tensor_shapes = adjust_tensor_shapes_fn(
diff --git a/megatron/core/pipeline_parallel/utils.py b/megatron/core/pipeline_parallel/utils.py
index f316806ead7..0593693501c 100644
--- a/megatron/core/pipeline_parallel/utils.py
+++ b/megatron/core/pipeline_parallel/utils.py
@@ -182,8 +182,6 @@ def __init__(
         self.free_input = free_input
         self.inputs = None
         self.outputs = None
-        self.delay_grads_release = False
-        self.manual_release_grads = False
 
     def default_backward_func(self, outputs, output_grad):
         """Default backward function"""
@@ -263,12 +261,6 @@ def _backward(self, *output_grad):
             for g in output_grad:
                 if g is not None:
                     g.record_stream(self.stream)
-                    # Manually trigger the memory release of dgrad tensor
-                    # to avoid delayed garbage collection. If
-                    # delay_grads_release is True, dgrad is last used in
-                    # wgrad compute and skip the release here.
-                    if self.manual_release_grads and not self.delay_grads_release:
-                        g.untyped_storage().resize_(0)
 
         grads = self.get_grad()
         self._release_state()
@@ -341,14 +333,18 @@ def run(
 _COMM_STREAM = None
 
 
-def set_streams(comm_stream=None):
+def set_streams(comm_stream=None, high_priority=False):
     """Set the stream for communication operations."""
     global _COMM_STREAM
 
     # Set communication stream
     if _COMM_STREAM is None:
         if comm_stream is None:
-            comm_stream = torch.cuda.Stream(device="cuda")
+            if high_priority:
+                _, high = torch.cuda.Stream.priority_range()
+                comm_stream = torch.cuda.Stream(device="cuda", priority=high)
+            else:
+                comm_stream = torch.cuda.Stream(device="cuda")
         _COMM_STREAM = comm_stream
 
 
diff --git a/megatron/core/ssm/gated_delta_net.py b/megatron/core/ssm/gated_delta_net.py
index 57315ec48d9..8df4df1e562 100644
--- a/megatron/core/ssm/gated_delta_net.py
+++ b/megatron/core/ssm/gated_delta_net.py
@@ -52,6 +52,7 @@
 
     HAVE_FLA = False
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -84,6 +85,7 @@ def __init__(
         use_qk_l2norm: bool = True,
         A_init_range: Tuple[float, float] = (1, 16),
         pg_collection: ProcessGroupCollection = None,
+        **kwargs,
     ):
         """
         Args:
@@ -297,9 +299,37 @@ def forward(
             # TODO: support inference
             raise NotImplementedError("GDN does not support inference for now.")
 
-        if packed_seq_params is not None:
-            # TODO: support packed sequence
-            raise NotImplementedError("GDN does not support packed sequence for now.")
+        if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
+            assert batch == 1, "Packed sequence expects batch dimension to be 1"
+            assert (
+                not self.config.deterministic_mode
+            ), "Packed sequence does not support deterministic mode."
+
+            # Resolve cu_seqlens with alignment padding handling.
+            cu_seqlens_q = self._resolve_cu_seqlens(
+                packed_seq_params.cu_seqlens_q_padded,
+                packed_seq_params.cu_seqlens_q,
+                seq_len,
+                "cu_seqlens_q",
+            )
+            cu_seqlens_kv = self._resolve_cu_seqlens(
+                packed_seq_params.cu_seqlens_kv_padded,
+                packed_seq_params.cu_seqlens_kv,
+                seq_len,
+                "cu_seqlens_kv",
+            )
+            assert torch.equal(cu_seqlens_q, cu_seqlens_kv), (
+                "Currently only support cu_seqlens_q equals to cu_seqlens_kv, "
+                f"but got {cu_seqlens_q=} and {cu_seqlens_kv=}"
+            )
+            num_packed_seqs = cu_seqlens_q.shape[0] - 1
+            assert num_packed_seqs > 0, (
+                "Number of packed sequences must be greater than 0, "
+                f"but got {cu_seqlens_q=} and {cu_seqlens_kv=}"
+            )
+        else:
+            cu_seqlens_q = None
+            cu_seqlens_kv = None
 
         # Input projection
         nvtx_range_push(suffix="in_proj")
@@ -307,20 +337,41 @@ def forward(
         nvtx_range_pop(suffix="in_proj")
 
         # CP All to All: CP to HP
-        qkvzba = tensor_a2a_cp2hp(
-            qkvzba,
-            seq_dim=0,
-            head_dim=-1,
-            cp_group=self.pg_collection.cp,
-            split_sections=[
-                self.qk_dim_local_tp,
-                self.qk_dim_local_tp,
-                self.v_dim_local_tp,
-                self.v_dim_local_tp,
-                self.num_value_heads // self.tp_size,
-                self.num_value_heads // self.tp_size,
-            ],
-        )
+        if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
+            unpacked_qkvzba = _unpack_sequence(qkvzba, cu_seqlens_q // self.cp_size, dim=0)
+            outputs = []
+            for qkvzba_i in unpacked_qkvzba:
+                qkvzba_i = tensor_a2a_cp2hp(
+                    qkvzba_i,
+                    seq_dim=0,
+                    head_dim=-1,
+                    cp_group=self.pg_collection.cp,
+                    split_sections=[
+                        self.qk_dim_local_tp,
+                        self.qk_dim_local_tp,
+                        self.v_dim_local_tp,
+                        self.v_dim_local_tp,
+                        self.num_value_heads // self.tp_size,
+                        self.num_value_heads // self.tp_size,
+                    ],
+                )
+                outputs.append(qkvzba_i)
+            qkvzba = torch.cat(outputs, dim=0)
+        else:
+            qkvzba = tensor_a2a_cp2hp(
+                qkvzba,
+                seq_dim=0,
+                head_dim=-1,
+                cp_group=self.pg_collection.cp,
+                split_sections=[
+                    self.qk_dim_local_tp,
+                    self.qk_dim_local_tp,
+                    self.v_dim_local_tp,
+                    self.v_dim_local_tp,
+                    self.num_value_heads // self.tp_size,
+                    self.num_value_heads // self.tp_size,
+                ],
+            )
 
         # Transpose: s b x --> b s x
         # From sbhd to bshd format
@@ -387,6 +438,7 @@ def forward(
                 activation=self.activation,
                 initial_state=None,
                 output_final_state=False,
+                cu_seqlens=cu_seqlens_q,
             )
         nvtx_range_pop(suffix="conv1d")
 
@@ -416,6 +468,7 @@ def forward(
             initial_state=None,
             output_final_state=False,
             use_qk_l2norm_in_kernel=False,
+            cu_seqlens=cu_seqlens_q,
         )
         nvtx_range_pop(suffix="gated_delta_rule")
 
@@ -430,9 +483,19 @@ def forward(
         norm_out = norm_out.transpose(0, 1).contiguous()
 
         # CP all to all: HP to CP
-        norm_out = tensor_a2a_hp2cp(
-            norm_out, seq_dim=0, head_dim=-1, cp_group=self.pg_collection.cp
-        )
+        if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
+            unpacked_norm_out = _unpack_sequence(norm_out, cu_seqlens_q, dim=0)
+            outputs = []
+            for norm_out_i in unpacked_norm_out:
+                norm_out_i = tensor_a2a_hp2cp(
+                    norm_out_i, seq_dim=0, head_dim=-1, cp_group=self.pg_collection.cp
+                )
+                outputs.append(norm_out_i)
+            norm_out = torch.cat(outputs, dim=0)
+        else:
+            norm_out = tensor_a2a_hp2cp(
+                norm_out, seq_dim=0, head_dim=-1, cp_group=self.pg_collection.cp
+            )
 
         # Output projection
         nvtx_range_push(suffix="out_proj")
@@ -504,6 +567,23 @@ def _compute_g_and_beta(self, A_log_local_cp, dt_bias_local_cp, alpha, beta):
         beta = beta.sigmoid()
         return g, beta
 
+    def _resolve_cu_seqlens(self, cu_seqlens_padded, cu_seqlens_actual, total_seq_len, name):
+        """Resolve cu_seqlens for packed sequence all-to-all, handling alignment padding."""
+        if cu_seqlens_padded is not None:
+            cu_seqlens = cu_seqlens_padded
+        else:
+            cu_seqlens = cu_seqlens_actual
+
+        total_cu = cu_seqlens[-1].item()
+        if total_cu != total_seq_len:
+            raise ValueError(
+                f"GDN: {name}[-1]={total_cu} does not match "
+                f"total_sequence_length={total_seq_len}. "
+                f"({cu_seqlens_padded=}, {cu_seqlens_actual=})."
+            )
+
+        return cu_seqlens
+
     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None, tp_group=None):
         """Provide a sharded state dictionary for distributed checkpointing."""
         # Guard for cases metadata is not provided
@@ -602,6 +682,17 @@ def _backward_out_proj(self):
         self.out_proj.backward_dw()
 
 
+def _unpack_sequence(x, cu_seqlens, dim=1):
+    unpacked_x = []
+    num_seqs = cu_seqlens.shape[0] - 1
+    for i in range(num_seqs):
+        idx_start = cu_seqlens[i].item()
+        idx_end = cu_seqlens[i + 1].item()
+        chunked_index = [slice(None)] * dim + [slice(idx_start, idx_end)]
+        unpacked_x.append(x[tuple(chunked_index)])
+    return unpacked_x
+
+
 ####################
 # Sharded state dict utilities
 ####################
@@ -808,13 +899,13 @@ def tensor_a2a_hp2cp(
         return tensor
 
     # Limitations of mamba_context_parallel._all_to_all_hp2cp.
-    assert seq_dim == 0, f"tensor_a2a_hp2cp only supports seq_dim == 0 for now, but got {seq_dim=}"
+    assert seq_dim == 0, f"tensor_a2a_cp2hp only supports seq_dim == 0 for now, but got {seq_dim=}"
     assert (
         head_dim == -1 or head_dim == 2
-    ), f"tensor_a2a_hp2cp only supports head_dim == -1 or 2 for now, but got {head_dim=}"
+    ), f"tensor_a2a_cp2hp only supports head_dim == -1 or 2 for now, but got {head_dim=}"
     assert (
         tensor.dim() == 3
-    ), f"tensor_a2a_hp2cp only supports 3-d input tensor for now, but got {tensor.dim()=}"
+    ), f"tensor_a2a_cp2hp only supports 3-d input tensor for now, but got {tensor.dim()=}"
 
     # Redo attention load balancing first if needed.
     if redo_attention_load_balancing:
@@ -853,6 +944,7 @@ def torch_chunk_gated_delta_rule(
     initial_state=None,
     output_final_state=False,
     use_qk_l2norm_in_kernel=False,
+    cu_seqlens=None,
 ):
     # pylint: disable=line-too-long
     '''
@@ -862,6 +954,10 @@ def torch_chunk_gated_delta_rule(
     Reference: https://github.com/huggingface/transformers/blob/144c8ce2809a2e21914017652700e1ecb450501e/src/transformers/models/qwen3_next/modeling_qwen3_next.py#L470-L547
     '''
 
+    assert (
+        cu_seqlens is None
+    ), "cu_seqlens is not supported for torch_chunk_gated_delta_rule for now."
+
     initial_dtype = query.dtype
     if use_qk_l2norm_in_kernel:
         query = l2norm(query, dim=-1, eps=1e-6)
diff --git a/megatron/core/ssm/mamba_context_parallel.py b/megatron/core/ssm/mamba_context_parallel.py
index 3925f8bd8df..0bfd12a9db1 100644
--- a/megatron/core/ssm/mamba_context_parallel.py
+++ b/megatron/core/ssm/mamba_context_parallel.py
@@ -85,6 +85,9 @@ def __init__(
         self.D_cp1 = D_cp1
         self.D_has_hdim = D_has_hdim
 
+        self._set_cp_params()
+
+    def _set_cp_params(self) -> None:
         self.cp_size = self.cp_group.size()
 
         if self.cp_size == 1:
@@ -130,6 +133,11 @@ def __init__(
         # and also `nheads_local_tpcp = nheads_local_tp // cp_size` whilst ngroups_local_tpcp is
         # either 1 or `ngroups_local_tp // cp_size`
 
+    def set_context_parallel_group(self, cp_group: torch.distributed.ProcessGroup):
+        """Set the context parallel group."""
+        self.cp_group = cp_group
+        self._set_cp_params()
+
     def pre_conv_ssm(
         self, input_: torch.Tensor, packed_seq_params: Optional[PackedSeqParams] = None
     ) -> torch.Tensor:
diff --git a/megatron/core/ssm/mamba_mixer.py b/megatron/core/ssm/mamba_mixer.py
index 4368681b585..e9ee2dd8deb 100644
--- a/megatron/core/ssm/mamba_mixer.py
+++ b/megatron/core/ssm/mamba_mixer.py
@@ -320,18 +320,26 @@ def __init__(
         self.act = nn.SiLU()
 
         with get_cuda_rng_tracker().fork():
-            # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
-            dt = torch.exp(
-                torch.rand(
+            if self.config.perform_initialization:
+                # Initialize dt bias so that F.softplus(dt_bias) is between dt_min and dt_max
+                dt = torch.exp(
+                    torch.rand(
+                        self.nheads_local_tp,
+                        device=torch.cuda.current_device(),
+                        dtype=config.params_dtype,
+                    )
+                    * (math.log(dt_max) - math.log(dt_min))
+                    + math.log(dt_min)
+                ).clamp(min=dt_init_floor)
+                # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+                inv_dt = dt + torch.log(-torch.expm1(-dt))
+            else:
+                inv_dt = torch.empty(
                     self.nheads_local_tp,
                     device=torch.cuda.current_device(),
                     dtype=config.params_dtype,
                 )
-                * (math.log(dt_max) - math.log(dt_min))
-                + math.log(dt_min)
-            ).clamp(min=dt_init_floor)
-            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
-            inv_dt = dt + torch.log(-torch.expm1(-dt))
+
             self.dt_bias = nn.Parameter(inv_dt)
             setattr(self.dt_bias, "tensor_model_parallel", True)
             setattr(self.dt_bias, "partition_dim", 0)
@@ -435,6 +443,11 @@ def forward(
                     out, out_bias = self._decode(hidden_states, conv_state, ssm_state)
                     return out, out_bias
 
+        # Dynamic CP group support
+        _orig_cp_group = self.cp.cp_group
+        if packed_seq_params is not None and packed_seq_params.cp_group is not None:
+            self.cp.set_context_parallel_group(packed_seq_params.cp_group)
+
         zxBCdt, _ = self.in_proj(hidden_states)
 
         zxBCdt = self.cp.pre_conv_ssm(zxBCdt, packed_seq_params)
@@ -452,6 +465,7 @@ def forward(
 
         out, out_bias = self.out_proj(y)
 
+        self.cp.set_context_parallel_group(_orig_cp_group)
         return out, out_bias
 
     def _dynamic_inference(self, hidden_states: torch.Tensor, context: DynamicInferenceContext):
diff --git a/megatron/core/tensor_parallel/random.py b/megatron/core/tensor_parallel/random.py
index 92d39ba92ef..4516fe10d88 100644
--- a/megatron/core/tensor_parallel/random.py
+++ b/megatron/core/tensor_parallel/random.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 # Parts of the code here are adapted from PyTorch
 # repo: https://github.com/pytorch/pytorch
@@ -598,7 +598,9 @@ def forward(
     @staticmethod
     def backward(ctx, *args):
         """Backward pass."""
-        if not torch.autograd._is_checkpoint_valid():
+        from megatron.core.transformer.cuda_graphs import is_graph_capturing
+
+        if not torch.autograd._is_checkpoint_valid() and not is_graph_capturing():
             raise RuntimeError(
                 "Checkpointing is not compatible with .grad(), "
                 "please use .backward() if possible"
@@ -642,10 +644,67 @@ def checkpoint(
     return CheckpointFunction.apply(function, distribute_saved_activations, *args)
 
 
+def _save_args_to_ctx(ctx, args):
+    """Save mixed tensor/non-tensor arguments into autograd ctx.
+
+    Since save_for_backward only supports tensors, this function separates
+    tensor and non-tensor arguments, saving tensors via save_for_backward
+    and storing non-tensor metadata (indices and values) as ctx attributes.
+
+    Use _load_args_from_ctx to reconstruct the original args.
+    """
+    tensor_args = []
+    non_tensor_entries = []
+
+    for index, arg in enumerate(args):
+        if isinstance(arg, torch.Tensor):
+            tensor_args.append(arg)
+            continue
+        non_tensor_entries.append((index, arg))
+
+    ctx.save_for_backward(*detach_variable(tuple(tensor_args)))
+    ctx._non_tensor_entries = tuple(non_tensor_entries)
+    ctx._total_args_count = len(args)
+
+
+def _load_args_from_ctx(ctx):
+    """Load and reconstruct mixed tensor/non-tensor arguments from autograd ctx.
+
+    This is the inverse of _save_args_to_ctx. It retrieves tensors from
+    ctx.saved_tensors and merges them with stored non-tensor arguments
+    to reconstruct the original args in their original order.
+
+    Returns:
+        tuple of reconstructed arguments in their original order.
+    """
+
+    def _detach_with_grad(tensor):
+        detached = tensor.detach()
+        detached.requires_grad_(tensor.requires_grad)
+        return detached
+
+    tensor_iter = iter(_detach_with_grad(t) for t in ctx.saved_tensors)
+    total_args_count = ctx._total_args_count
+    non_tensor_map = dict(ctx._non_tensor_entries)
+
+    reconstructed_args = []
+    for index in range(total_args_count):
+        if index in non_tensor_map:
+            reconstructed_args.append(non_tensor_map[index])
+        else:
+            reconstructed_args.append(next(tensor_iter))
+    return tuple(reconstructed_args)
+
+
 class CheckpointWithoutOutputFunction(torch.autograd.Function):
     """
     Checkpoint Function Helper for CheckpointWithoutOutput.
     Save context for recompute.
+
+    Handles both tensor and non-tensor arguments:
+    - Tensor arguments are saved via save_for_backward
+    - Non-tensor arguments (int, float, bool, None, etc.) are stored separately
+      in ctx attributes and reconstructed during recomputation
     """
 
     @staticmethod
@@ -668,7 +727,10 @@ def forward(
 
         with torch.no_grad(), fwd_ctx:
             outputs = run_function(*args)
-        ctx.save_for_backward(*detach_variable(args))
+
+        # Save tensor and non-tensor arguments into ctx for recomputation
+        _save_args_to_ctx(ctx, args)
+
         # the CheckpointWithoutOutput object is passed in, then it can access the saved input
         # tensors later for recomputation
         checkpoint_without_output_obj.ctx = ctx
@@ -685,10 +747,56 @@ def backward(ctx, *args):
         torch.autograd.backward(outputs, args)
         ctx.outputs = None
         ctx.inputs = None
-        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else inp for inp in inputs)
+        grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else None for inp in inputs)
         return (None, None) + grads
 
 
+class CheckpointManager:
+    """
+    Manages multiple CheckpointWithoutOutput objects within a TransformerBlock
+    cross layer recomputations, enabling unified recomputation during backward pass.
+    This is particularly useful for scenarios where multiple checkpoint operations have
+    sequential dependencies (i.e., the output of one checkpoint is the input of the next).
+
+    Usage:
+        ckptManager = CheckpointManager()
+        ckpt_function = CheckpointWithoutOutput(ckpt_manager=ckptManager)
+        ckpt_function.checkpoint(run_function, *args)
+        # other checkpointed operations
+        ckpt_manager.discard_all_outputs_and_register_unified_recompute(final_output)
+    """
+
+    def __init__(self):
+        self.checkpoints = []
+        # Set by TransformerBlock before each layer forward.
+        # When True, the layer should keep block-boundary output uncheckpointed.
+        self.is_last_layer_in_recompute_block = False
+
+    def add_checkpoint(self, ckpt):
+        """Add a checkpoint to the manager."""
+        if not isinstance(ckpt, CheckpointWithoutOutput):
+            raise TypeError("Expected CheckpointWithoutOutput object")
+        if ckpt.outputs is None:
+            raise ValueError("CheckpointWithoutOutput must call checkpoint() before adding")
+        self.checkpoints.append(ckpt)
+
+    def discard_all_outputs_and_register_unified_recompute(self, hook_tensor):
+        """Discard all checkpoint outputs to save memory and register unified recompute hook."""
+        for ckpt in self.checkpoints:
+            for output in ckpt.outputs:
+                output.untyped_storage().resize_(0)
+
+        # Register unified recompute hook
+        if hook_tensor.requires_grad:
+            hook_tensor.register_hook(self._unified_recompute_hook)
+
+    def _unified_recompute_hook(self, grad_output):
+        for ckpt in self.checkpoints:
+            # Call _recompute for each checkpoint in forward order
+            # The _recompute method will restore the output tensor storage
+            ckpt._recompute(None)
+
+
 class CheckpointWithoutOutput(object):
     """
     Checkpoint a model or part of the model and release the output.
@@ -703,8 +811,19 @@ class CheckpointWithoutOutput(object):
     discarded output tensors are directly saved in the following modules for backward computation.
     """
 
-    def __init__(self, fp8=False):
-        self.fp8 = fp8 is not None
+    def __init__(self, fp8=False, ckpt_manager=None):
+        """
+        Initialize CheckpointWithoutOutput.
+
+        Args:
+            fp8: Whether to use FP8 mode. Defaults to False.
+            ckpt_manager: Optional CheckpointManager instance. When provided,
+                         checkpoint() will auto-register to the manager, and
+                         discard_output_and_register_recompute() will only discard
+                         output without registering individual hooks.
+        """
+        self.fp8 = bool(fp8)
+        self.ckpt_manager = ckpt_manager
         self.run_function = None
         self.fwd_cpu_rng_state = None
         self.fwd_cuda_rng_state = None
@@ -713,7 +832,12 @@ def __init__(self, fp8=False):
         self.outputs = None
 
     def checkpoint(self, run_function: Callable[[Unpack[_Ts]], _R], *args: Unpack[_Ts]) -> _R:
-        """Checkpoint function."""
+        """
+        Checkpoint function.
+
+        If ckpt_manager was provided during initialization, this checkpoint
+        will be automatically registered to the manager after execution.
+        """
 
         # If in cuda graph warmup, disable checkpointing, as 'discard_output_and_register_recompute'
         # may be called in a separate graph warmup.
@@ -730,6 +854,11 @@ def checkpoint(self, run_function: Callable[[Unpack[_Ts]], _R], *args: Unpack[_T
         self.outputs = outputs
         if isinstance(self.outputs, torch.Tensor):
             self.outputs = (self.outputs,)
+
+        # Auto-register to manager if provided
+        if self.ckpt_manager is not None:
+            self.ckpt_manager.add_checkpoint(self)
+
         return outputs
 
     def _recompute(self, _):
@@ -738,7 +867,7 @@ def _recompute(self, _):
         from megatron.core.transformer.cuda_graphs import is_graph_capturing, is_graph_warmup
 
         # The recomputation has been triggered already. Just return.
-        # Handle cudagraphs, do nothing if currently in graph warmup
+        # Handle cudagraphs: do nothing if currently in graph warmup
         if self.ctx is None or is_graph_warmup():
             return
 
@@ -760,17 +889,8 @@ def _recompute(self, _):
                 recompute_ctx = contextlib.nullcontext()
                 fp8_ctx = contextlib.nullcontext()
 
-            # Store the inputs for backward pass
-            inputs = self.ctx.saved_tensors
-
-            def detach(t):
-                if isinstance(t, torch.Tensor):
-                    requires_grad = t.requires_grad
-                    t = t.detach()
-                    t.requires_grad_(requires_grad)
-                return t
-
-            inputs = tuple(detach(t) for t in inputs)
+            # Reconstruct full args list from saved ctx
+            inputs = _load_args_from_ctx(self.ctx)
             with torch.enable_grad(), fp8_ctx, recompute_ctx:
                 outputs = self.run_function(*inputs)
 
@@ -803,10 +923,11 @@ def discard_output_and_register_recompute(self, hook_tensor):
         in the forward pass and the gradient of the hook_tensor is computed before the recomputed
         tensors are used.
         """
-
+        # When ckpt_manager is set, this is a no-op.
+        # Manager handles all discarding and hook registration uniformly.
         from megatron.core.transformer.cuda_graphs import is_graph_warmup
 
-        if is_graph_warmup():
+        if self.ckpt_manager is not None or is_graph_warmup():
             return
 
         # use resize to release the output tensor memory and still keep the metadata in the tensors.
diff --git a/megatron/core/transformer/__init__.py b/megatron/core/transformer/__init__.py
index 0e3cdcfa57e..75e3b485c4f 100644
--- a/megatron/core/transformer/__init__.py
+++ b/megatron/core/transformer/__init__.py
@@ -1,6 +1,10 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 from .module import MegatronModule
 from .spec_utils import ModuleSpec, build_module
 from .transformer_config import MLATransformerConfig, TransformerConfig
-from .transformer_layer import TransformerLayer, TransformerLayerSubmodules
+from .transformer_layer import (
+    HyperConnectionTransformerLayer,
+    TransformerLayer,
+    TransformerLayerSubmodules,
+)
diff --git a/megatron/core/transformer/attention.py b/megatron/core/transformer/attention.py
index f89259be442..1db2f886c6d 100644
--- a/megatron/core/transformer/attention.py
+++ b/megatron/core/transformer/attention.py
@@ -258,12 +258,14 @@ def __init__(
         cp_comm_type: str | None = None,
         pg_collection: ProcessGroupCollection | None = None,
         pp_layer_offset: Optional[int] = None,
+        is_mtp_layer: bool = False,
     ):
         super().__init__(config=config)
 
         self.config = config
         self.layer_number = layer_number
         self._pp_layer_offset = pp_layer_offset
+        self.is_mtp_layer = is_mtp_layer
 
         self.attn_mask_type = attn_mask_type
         self.attention_type = attention_type
@@ -315,6 +317,7 @@ def __init__(
         self.key_hidden_size = self.hidden_size_per_attention_head
         self.val_hidden_size = self.hidden_size_per_attention_head
 
+        # TODO: This is built twice when using MLA, should be refactored.
         if self.config.num_query_groups < world_size:
             # TE throws an assertion error if num_kv_heads / num_query_groups
             # is not divisible by TP size.
@@ -386,6 +389,71 @@ def __init__(
             # the quantized tensor.
             set_save_original_input(self.linear_proj)
 
+        # Per-layer RotaryEmbedding (used when rotary_base_per_layer is set in config).
+        self.rotary_pos_emb = None
+        if getattr(self.config, 'rotary_base_per_layer', None):
+            rotary_base = self.config.rotary_base_per_layer[self.layer_number - 1]
+            self._build_per_layer_rotary_pos_emb(rotary_base)
+
+    def _build_per_layer_rotary_pos_emb(self, rotary_base: float) -> None:
+        """Build self.rotary_pos_emb using a layer-specific rotary base."""
+        from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
+
+        seq_len_interpolation_factor = self.config.rotary_scaling_factor
+        if self.config.position_embedding_type == 'rope' and not self.config.multi_latent_attention:
+            self.rotary_pos_emb = RotaryEmbedding(
+                kv_channels=self.config.kv_channels,
+                rotary_percent=self.config.rotary_percent,
+                rotary_interleaved=self.config.rotary_interleaved,
+                seq_len_interpolation_factor=seq_len_interpolation_factor,
+                rotary_base=rotary_base,
+                rope_scaling=self.config.rope_scaling,
+                rope_scaling_factor=self.config.rope_scaling_factor,
+                use_cpu_initialization=self.config.use_cpu_initialization,
+                cp_group=self.pg_collection.cp,
+            )
+        elif self.config.position_embedding_type == 'yarn':
+            self.rotary_pos_emb = YarnRotaryEmbedding(
+                kv_channels=self.config.kv_channels,
+                rotary_percent=self.config.rotary_percent,
+                rotary_interleaved=self.config.rotary_interleaved,
+                seq_len_interpolation_factor=seq_len_interpolation_factor,
+                rotary_base=rotary_base,
+                scaling_factor=getattr(self.config, "yarn_rotary_scaling_factor"),
+                original_max_position_embeddings=getattr(
+                    self.config, "yarn_original_max_position_embeddings"
+                ),
+                beta_fast=getattr(self.config, "yarn_beta_fast"),
+                beta_slow=getattr(self.config, "yarn_beta_slow"),
+                mscale=getattr(self.config, "yarn_mscale"),
+                mscale_all_dim=getattr(self.config, "yarn_mscale_all_dim"),
+                correction_range_round_to_int=getattr(
+                    self.config, "yarn_correction_range_round_to_int"
+                ),
+                use_cpu_initialization=self.config.use_cpu_initialization,
+            )
+        elif (
+            self.config.position_embedding_type == 'mrope'
+            and not self.config.multi_latent_attention
+        ):
+            self.rotary_pos_emb = MultimodalRotaryEmbedding(
+                kv_channels=self.config.kv_channels,
+                rotary_percent=self.config.rotary_percent,
+                rotary_interleaved=self.config.rotary_interleaved,
+                seq_len_interpolation_factor=seq_len_interpolation_factor,
+                rotary_base=rotary_base,
+            )
+            self.mrope_section = self.config.mrope_section
+            assert (
+                self.mrope_section is not None
+            ), "mrope require mrope_section setting, but we got None from TransformerConfig"
+        else:
+            raise NotImplementedError(
+                f"rotary_base_per_layer does not support "
+                f"position_embedding_type={self.config.position_embedding_type!r} "
+                f"(only 'rope' / 'yarn' / 'mrope' are supported)."
+            )
+
     def _checkpointed_attention_forward(
         self,
         query,
@@ -1008,6 +1076,13 @@ def forward(
             (Tuple[Tensor, Tensor]) Attention output and bias.
 
         """
+
+        # here we need to set the right cp group for dynamic-cp
+        _orig_cp_group = self.pg_collection.cp
+        if packed_seq_params is not None and packed_seq_params.local_cp_size is not None:
+            assert packed_seq_params.cp_group is not None, "cp_group must be set in dynamic-cp mode"
+            self.pg_collection.cp = packed_seq_params.cp_group
+
         # Check if we need to skip RoPE
         # no_rope is 0-indexed array and self.layer_number is 1-indexed
         no_rope = (
@@ -1016,6 +1091,11 @@ def forward(
         if no_rope:
             rotary_pos_emb = None
 
+        # Per-layer theta: override the model-level RoPE with this layer's own embedding.
+        if self.rotary_pos_emb is not None and rotary_pos_emb is not None:
+            seq_len = rotary_pos_emb.shape[0]
+            rotary_pos_emb = self.rotary_pos_emb(seq_len)
+
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
         if inference_context and inference_context.is_dynamic_batching():
@@ -1066,25 +1146,26 @@ def forward(
                 self.k_layernorm is None or isinstance(self.k_layernorm, IdentityOp),
             ]
         )
+        output_gate = self.config.attention_output_gate
         # Check if fused_single_qkv_rope is requested but either unavailable or not
         # supported for the current use case.
         if self.attention_type != "cross":
             assert not (
                 self.config.fused_single_qkv_rope and split_qkv
             ), "fused_single_qkv_rope requested but not available/supported for the config."
+        if output_gate:
+            assert split_qkv, "output_gate is not supported for unsplit mixed_qkv tensor."
 
-        with off_interface(self.offload_qkv_linear, hidden_states, "qkv_linear") as hidden_states:
+        qkv_linear_manager = off_interface(self.offload_qkv_linear, hidden_states, "qkv_linear")
+        with qkv_linear_manager as hidden_states:
             qkv_output = self.get_query_key_value_tensors(
                 hidden_states,
                 key_value_states,
                 split_qkv=split_qkv,
                 output_gate=self.config.attention_output_gate,
             )
-        if self.offload_qkv_linear:
-            # `qkv_output` may be a tuple; commit supports tuple/list and will keep structure.
-            qkv_output = off_interface.group_commit(
-                qkv_output, name="qkv_linear", forced_released_tensors=[]
-            )
+        # `qkv_output` may be a tuple; commit supports tuple/list and will keep structure.
+        qkv_output = qkv_linear_manager.group_offload(qkv_output, forced_released_tensors=[])
         attn_mask_type = self.attn_mask_type
         block_table = None
         gate = None
@@ -1134,6 +1215,7 @@ def forward(
             out = output.transpose(0, 1).contiguous()
             context_layer = out.view(out.size(0), out.size(1), -1)
             output, bias = self.linear_proj(context_layer)
+            self.pg_collection.cp = _orig_cp_group
             return output, bias
 
         if (
@@ -1227,6 +1309,9 @@ def forward(
         # ==================================
 
         nvtx_range_push(suffix="core_attention")
+        core_attn_manager = off_interface(
+            self.offload_core_attention and self.training, query, "core_attn"
+        )
         if self.checkpoint_core_attention and self.training:
             core_attn_out = self._checkpointed_attention_forward(
                 query,
@@ -1240,9 +1325,7 @@ def forward(
         else:
             if inference_context is None or inference_context.is_static_batching():
                 # Static batching attention kernel.
-                with off_interface(
-                    self.offload_core_attention and self.training, query, "core_attn"
-                ) as query:
+                with core_attn_manager as query:
                     core_attn_out = apply_module(self.core_attention)(
                         query,
                         key,
@@ -1278,10 +1361,10 @@ def forward(
                 if is_using_quantization_scales(self.config):
                     core_attn_out[inference_context.padding_slice] = 0.0
 
-            if self.offload_core_attention and self.training:
-                core_attn_out = off_interface.group_commit(
-                    core_attn_out, name="core_attn", forced_released_tensors=[query, key, value]
-                )
+            core_attn_out = core_attn_manager.group_offload(
+                core_attn_out, forced_released_tensors=[query, key, value]
+            )
+
         if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
             # reshape to same output shape as unpacked case
             # (t, np, hn) -> (t, b=1, h=np*hn)
@@ -1300,14 +1383,13 @@ def forward(
         # Output. [sq, b, h]
         # =================
         nvtx_range_push(suffix="linear_proj")
-        with off_interface(self.offload_attn_proj, core_attn_out, "attn_proj") as core_attn_out:
+        attn_proj_manager = off_interface(self.offload_attn_proj, core_attn_out, "attn_proj")
+        with attn_proj_manager as core_attn_out:
             output, bias = self.linear_proj(core_attn_out)
-        if self.offload_attn_proj:
-            output = off_interface.group_commit(
-                output, name="attn_proj", forced_released_tensors=[core_attn_out]
-            )
+        output = attn_proj_manager.group_offload(output, forced_released_tensors=[core_attn_out])
         nvtx_range_pop(suffix="linear_proj")
 
+        self.pg_collection.cp = _orig_cp_group
         return output, bias
 
     @jit_fuser
@@ -1347,6 +1429,7 @@ def __init__(
         cp_comm_type: str | None = None,
         pg_collection: ProcessGroupCollection | None = None,
         pp_layer_offset: Optional[int] = None,
+        is_mtp_layer: bool = False,
     ):
         super().__init__(
             config=config,
@@ -1357,6 +1440,7 @@ def __init__(
             cp_comm_type=cp_comm_type,
             pg_collection=pg_collection,
             pp_layer_offset=pp_layer_offset,
+            is_mtp_layer=is_mtp_layer,
         )
 
         self.linear_qkv_out_dim = self.query_projection_size + 2 * self.kv_projection_size
@@ -1756,6 +1840,7 @@ def __init__(
         attn_mask_type: AttnMaskType = AttnMaskType.padding,
         cp_comm_type: str | None = None,
         pg_collection: ProcessGroupCollection | None = None,
+        is_mtp_layer: bool = False,
     ):
         super().__init__(
             config=config,
@@ -1765,6 +1850,7 @@ def __init__(
             attention_type="cross",
             cp_comm_type=cp_comm_type,
             pg_collection=pg_collection,
+            is_mtp_layer=is_mtp_layer,
         )
 
         if self.config.num_query_groups != self.config.num_attention_heads:
@@ -1804,6 +1890,8 @@ def get_query_key_value_tensors(
         Derives `query` tensor from `hidden_states`, and `key`/`value` tensors
         from `key_value_states`.
         """
+        assert not output_gate, "Output gate is not supported in cross attention for now."
+
         assert split_qkv, "split_qkv must be True for CrossAttention"
         assert not output_gate, "Output gate is not supported in cross attention for now."
 
diff --git a/megatron/core/transformer/cuda_graphs.py b/megatron/core/transformer/cuda_graphs.py
index 067f6055015..48bd34ce499 100644
--- a/megatron/core/transformer/cuda_graphs.py
+++ b/megatron/core/transformer/cuda_graphs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import dataclasses
 import gc
@@ -821,13 +821,12 @@ def create_fwd_graph(self, args, kwargs, outputs=None, clone_inputs=True):
 
         is_moe = isinstance(self.base_module, MoETransformerLayer)
         if is_moe:
-            from megatron.core.transformer.moe.moe_utils import get_moe_layer_wise_logging_tracker
+            from megatron.core.transformer.moe.moe_logging import get_moe_metrics_tracker
 
-            tracker = get_moe_layer_wise_logging_tracker()
+            moe_metrics_tracker = get_moe_metrics_tracker()
             cached_aux_losses = {}
-            for name in tracker:
-                if "values" in tracker[name]:
-                    cached_aux_losses[name] = torch.clone(tracker[name]["values"])
+            for name, entry in moe_metrics_tracker.metrics.items():
+                cached_aux_losses[name] = entry.values.clone()
 
         self.fwd_graph = torch.cuda.CUDAGraph()
 
@@ -1017,8 +1016,11 @@ def clone_ten(ten):
                     param.main_grad.copy_(main_grad_copy)
 
         if is_moe:
-            for name in tracker:
-                tracker[name]["values"].copy_(cached_aux_losses[name])
+            for name, cached_values in cached_aux_losses.items():
+                assert (
+                    name in moe_metrics_tracker.metrics
+                ), "cached metrics must be found in the tracker."
+                moe_metrics_tracker.metrics[name].values.copy_(cached_values)
 
     def create_bwd_graph(self):
         """Create a bwd cudagraph for this runner. Should be called inside
@@ -2268,6 +2270,15 @@ def _get_fp8_enabled():
                     )
             else:
                 kwargs['fp8_enabled'] = False
+
+            from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+                FineGrainedActivationOffloadingInterface as off_interface,
+            )
+
+            # Disable and enable offloading before and after the warmup stage of cuda graph.
+            if self.config.fine_grained_activation_offloading:
+                kwargs['pre_warmup_hook'] = off_interface.disable_offload
+                kwargs['post_warmup_hook'] = off_interface.enable_offload
             return kwargs
 
         kwargs = get_make_graphed_callables_kwargs()
@@ -2315,8 +2326,27 @@ def _finish_capturing(self, start_time):
         )
         _set_capture_end()
 
+        from megatron.core.distributed.finalize_model_grads import reset_model_temporary_tensors
+        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+            FineGrainedActivationOffloadingInterface as off_interface,
+        )
+
+        if self.config.fine_grained_activation_offloading:
+            off_interface.reset()
+
+        torch.distributed.barrier()
+        for model_chunk in self.model:
+            model_chunk.zero_grad_buffer()
+        for optimizer in self.optimizers:
+            optimizer.zero_grad()
+        from megatron.core.transformer.moe.moe_logging import get_moe_metrics_tracker
+
+        get_moe_metrics_tracker().clear()
+        reset_model_temporary_tensors(self.config, self.model)
+
         torch.cuda.synchronize()
         self._reset_after_capture()
+
         if FREEZE_GC:
             gc.unfreeze()
         gc.collect()
diff --git a/megatron/core/transformer/dot_product_attention.py b/megatron/core/transformer/dot_product_attention.py
index 69039e0bfd0..26622839c14 100644
--- a/megatron/core/transformer/dot_product_attention.py
+++ b/megatron/core/transformer/dot_product_attention.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 
 import math
@@ -12,6 +12,9 @@
 from megatron.core.fusions.fused_softmax import FusedScaleMaskSoftmax
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer.dot_product_attention_context_parallel import (
+    AttentionFuncionWithContextParallel,
+)
 from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.transformer_config import TransformerConfig
@@ -54,9 +57,12 @@ def __init__(
 
         self.config: TransformerConfig = config
 
-        assert (
-            self.config.context_parallel_size == 1
-        ), "Context parallelism is only supported by TEDotProductAttention!"
+        if self.config.context_parallel_size > 1:
+            assert attention_dropout is None and self.config.attention_dropout == 0.0, (
+                f'DotProductAttention with context parallelism does not support attention dropout,'
+                f' but got {self.config.context_parallel_size=},'
+                f' {attention_dropout=}, and {self.config.attention_dropout=}.'
+            )
 
         self.layer_number = max(1, layer_number)
         self.attn_mask_type = attn_mask_type
@@ -174,6 +180,19 @@ def forward(
                 self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
             )
 
+        if self.config.context_parallel_size > 1:
+            output = AttentionFuncionWithContextParallel.apply(
+                query,
+                key,
+                value,
+                attention_mask,
+                self.config.attention_dropout,
+                self.softmax_scale,
+                parallel_state.get_context_parallel_group(),
+            )
+            output = output.view(query.shape[0], query.shape[1], self.hidden_size_per_partition)
+            return output
+
         # [b, np, sq, sk]
         output_size = (query.size(1), query.size(2), query.size(0), key.size(0))
 
diff --git a/megatron/core/transformer/dot_product_attention_context_parallel.py b/megatron/core/transformer/dot_product_attention_context_parallel.py
new file mode 100644
index 00000000000..aaf08d40ade
--- /dev/null
+++ b/megatron/core/transformer/dot_product_attention_context_parallel.py
@@ -0,0 +1,345 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+# Some of this code was adopted from https://github.com/zhuzilin/ring-flash-attention/
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from torch.nn import functional as F
+
+try:
+    import einops
+
+    HAVE_EINOPS = True
+except ImportError:
+    HAVE_EINOPS = False
+
+
+@torch.no_grad
+def eager_attn_fwd(q, k, v, attn_bias, sinks, scale, dropout):
+    """Forward pass for eager attention"""
+
+    # Rearrange query, key, value to (b, h, s, d)
+    b, sq, h, d = q.shape
+    sk = k.shape[1]
+    _q = einops.rearrange(q, 'b s h d -> b h s d')
+    _k = einops.rearrange(k, 'b s h d -> b h d s')
+    _v = einops.rearrange(v, 'b s h d -> b h s d')
+
+    # Compute attention weights
+    attn_w = torch.matmul(_q, _k) * scale
+    attn_w = attn_w + attn_bias
+
+    # Add sinks to attention weights
+    if sinks is None:
+        logits = attn_w
+    else:
+        _sinks = sinks.reshape(1, h, 1, 1).expand(b, -1, sq, 1)
+        logits = torch.cat([attn_w, _sinks], dim=-1)
+
+    # Compute attention scores
+    probs = F.softmax(logits, dim=-1, dtype=logits.dtype)
+    if sinks is None:
+        attn_w = probs
+    else:
+        attn_w = probs[..., :-1]  # Drop the sink
+
+    # Compute attention output
+    attn_output = torch.matmul(attn_w, _v)
+    attn_output = einops.rearrange(attn_output, 'b h s d -> b s h d')
+    attn_output = attn_output.contiguous()
+
+    return attn_output, probs
+
+
+@torch.no_grad
+def eager_attn_bwd(q, k, v, attn_bias, sinks, scale, dropout, attn_output, probs, grad_output):
+    """Backward pass for eager attention"""
+
+    # Rearrange query, key, value to (b, h, s, d)
+    b, sq, h, d = q.shape
+    sk = k.shape[1]
+    _q_T = einops.rearrange(q, 'b s h d -> b h d s')
+    _k_T = einops.rearrange(k, 'b s h d -> b h s d')
+    _v_T = einops.rearrange(v, ' b s h d -> b h d s')
+
+    # Backward pass for score @ value
+    if sinks is None:
+        attn_w = probs
+    else:
+        attn_w = probs[..., :-1]  # Drop the sink
+    grad_output = einops.rearrange(grad_output, 'b s h d -> b h s d')
+    attn_w_T = einops.rearrange(attn_w, ' b h sq sk -> b h sk sq')
+    grad__v = torch.matmul(attn_w_T, grad_output)
+    grad_attn_w = torch.matmul(grad_output, _v_T)
+
+    # Backward pass for softmax
+    if sinks is None:
+        grad_probs = grad_attn_w
+    else:
+        dummy = torch.zeros((b, h, sq, 1), device=q.device, dtype=q.dtype)
+        grad_probs = torch.cat([grad_attn_w, dummy], dim=3)
+    del grad_attn_w
+    grad_logits = torch._softmax_backward_data(
+        grad_probs, probs, -1, probs.dtype
+    )  # [b, h, sq, sk+1]
+
+    # Backward pass for adding sinks
+    if sinks is None:
+        grad_sinks = None
+        grad_attn_w = grad_logits
+    else:
+        grad__sinks = grad_logits[:, :, :, -1]  # [b, h, sq]
+        grad_sinks = einops.rearrange(grad__sinks, 'b h s -> h (b s)').sum(-1)
+        grad_attn_w = grad_logits[:, :, :, :-1].contiguous()  # [b, h, sq, sk]
+
+    # Backward pass for q @ K^T
+    grad_attn_w *= scale
+    grad__q = torch.matmul(grad_attn_w, _k_T)
+    grad__k = torch.matmul(_q_T, grad_attn_w)
+
+    # Rearrange grads to (b, s, h, d)
+    grad_v = einops.rearrange(grad__v, 'b h s d -> b s h d')
+    grad_k = einops.rearrange(grad__k, 'b h d s -> b s h d')
+    grad_q = einops.rearrange(grad__q, 'b h s d -> b s h d')
+    return grad_q, grad_k, grad_v, grad_sinks
+
+
+class AllGatherComm:
+    """All gather communication with async operations"""
+
+    def __init__(self, group=None) -> None:
+        self.group = group
+        self.handles = []
+
+    def all_gather(self, output_tensor: torch.Tensor, input_tensor: torch.Tensor):
+        '''All gather the input tensor to the output tensor'''
+
+        if self.group is None:
+            output_tensor.copy_(input_tensor)
+        else:
+            handle = torch.distributed.all_gather_into_tensor(
+                output_tensor, input_tensor, group=self.group, async_op=True
+            )
+            self.handles.append(handle)
+
+    def wait(self):
+        '''Wait for all gather operations to complete'''
+
+        if self.group is not None:
+            for handle in self.handles:
+                handle.wait()
+            self.handles = []
+
+
+def to_zz_mask_attn_bias(attention_mask, cp_size, nheads, nheads_k, heads_k_stride, device, dtype):
+    '''Convert the attention mask to the attention bias'''
+
+    if cp_size == 1:
+        zz_mask = attention_mask
+    else:
+        chunked = attention_mask.chunk(dim=3, chunks=cp_size * 2)
+        zz_mask = [_x for _p in zip(chunked[:cp_size], reversed(chunked[cp_size:])) for _x in _p]
+        zz_mask = torch.cat(zz_mask, dim=3)
+    attn_bias = torch.zeros(zz_mask.shape, device=device, dtype=dtype)
+    attn_bias.masked_fill_(zz_mask, float('-inf'))
+    attn_bias = attn_bias.expand(-1, heads_k_stride * (nheads // nheads_k), -1, -1)
+    return attn_bias
+
+
+class AttentionFuncionWithContextParallel(torch.autograd.Function):
+    """Native attention function with context parallelism."""
+
+    @staticmethod
+    def forward(ctx, q, k, v, attention_mask, attention_dropout, softmax_scale, pg):
+        '''Forward pass for the native attention function with context parallelism'''
+
+        # Assert einops exists
+        if not HAVE_EINOPS:
+            raise ImportError("einops is required by the attention CP but cannot be imported.")
+
+        # Initialize communication group and constants
+        cp_size = 1
+        if pg is not None:
+            cp_size = torch.distributed.get_world_size(pg)
+        comm = AllGatherComm(group=pg)
+        nheads = q.shape[2]
+        nheads_k = k.shape[2]
+        heads_k_stride = 1
+        assert nheads % nheads_k == 0 and nheads_k % heads_k_stride == 0
+        outs = []
+        probs = []
+
+        # Initialize KV buffers
+        kv_buffer = torch.empty(
+            (2, k.shape[0] * cp_size, k.shape[1], heads_k_stride, k.shape[3]),
+            dtype=k.dtype,
+            device=k.device,
+        )
+        kv_buffer_copy = torch.empty_like(kv_buffer)
+
+        # All-gather first chunk of KV buffers
+        k_0 = k[:, :, :heads_k_stride].contiguous()
+        v_0 = v[:, :, :heads_k_stride].contiguous()
+        comm.all_gather(kv_buffer_copy[0], k_0)
+        comm.all_gather(kv_buffer_copy[1], v_0)
+
+        # Prepare attention bias
+        assert (
+            attention_mask is not None
+        ), "Attention mask is required for the native attention function with context parallelism"
+        attn_bias = to_zz_mask_attn_bias(
+            attention_mask, cp_size, nheads, nheads_k, heads_k_stride, q.device, q.dtype
+        )
+
+        # Iterate over heads
+        for i in range(0, nheads_k, heads_k_stride):
+            # Wait for previous all-gather to complete
+            comm.wait()
+            kv_buffer, kv_buffer_copy = kv_buffer_copy, kv_buffer
+            # All-gather the next portion of KV buffers if not the last iteration
+            if i < nheads_k - heads_k_stride:
+                kvsl = i + heads_k_stride
+                kvsr = kvsl + heads_k_stride
+                send_k = k[:, :, kvsl:kvsr].contiguous()
+                send_v = v[:, :, kvsl:kvsr].contiguous()
+                comm.all_gather(kv_buffer_copy[0], send_k)
+                comm.all_gather(kv_buffer_copy[1], send_v)
+
+            # Prepare query, key, value for attention
+            q_i = q[:, :, i * nheads // nheads_k : (i + heads_k_stride) * nheads // nheads_k]
+            k_i = kv_buffer[0]
+            v_i = kv_buffer[1]
+
+            # Rearrange query, key, value to (b, s, h, d)
+            q_i = einops.rearrange(q_i, 's b h d -> b s h d')
+            k_i = einops.rearrange(k_i, 's b h d -> b s h d')
+            v_i = einops.rearrange(v_i, 's b h d -> b s h d')
+
+            # Forward pass
+            out_i, probs_i = eager_attn_fwd(
+                q_i, k_i, v_i, attn_bias, None, softmax_scale, attention_dropout
+            )
+            outs.append(out_i)
+            probs.append(probs_i)
+
+        # Concatenate outputs and rearrange to (s, b, h, d)
+        out = torch.cat(outs, dim=2)
+        out = einops.rearrange(out, 'b s h d -> s b h d')
+
+        # Save contexts for backward pass
+        ctx.save_for_backward(q, k, v, attention_mask, *outs, *probs)
+        ctx.dropout = attention_dropout
+        ctx.scale = softmax_scale
+        ctx.heads_k_stride = heads_k_stride  # TODO make it configurable
+        ctx.pg = pg
+
+        return out
+
+    @staticmethod
+    def backward(ctx, dout):
+        '''Backward pass for the native attention function with context parallelism'''
+
+        # Initialize or resume constants and communication group
+        q, k, v, attention_mask, *rest = ctx.saved_tensors
+        nheads = q.shape[2]
+        nheads_k = k.shape[2]
+        heads_k_stride = ctx.heads_k_stride
+        assert nheads_k % heads_k_stride == 0
+        outs = rest[: nheads_k // heads_k_stride]
+        probs = rest[nheads_k // heads_k_stride :]
+        pg = ctx.pg
+        cp_size = 1
+        if pg is not None:
+            cp_size = torch.distributed.get_world_size(pg)
+        comm = AllGatherComm(group=pg)
+
+        # Initialize KV buffers
+        kv_buffer = torch.empty(
+            (2, k.shape[0] * cp_size, k.shape[1], heads_k_stride, k.shape[3]),
+            dtype=k.dtype,
+            device=k.device,
+        )
+        kv_buffer_copy = torch.empty_like(kv_buffer)
+
+        # All-gather first chunk of KV buffers
+        dq = []
+        dk = []
+        dv = []
+        k_0 = k[:, :, :heads_k_stride].contiguous()
+        v_0 = v[:, :, :heads_k_stride].contiguous()
+        comm.all_gather(kv_buffer_copy[0], k_0)
+        comm.all_gather(kv_buffer_copy[1], v_0)
+
+        # Prepare attention bias
+        attn_bias = to_zz_mask_attn_bias(
+            attention_mask, cp_size, nheads, nheads_k, heads_k_stride, q.device, q.dtype
+        )
+
+        # Iterate over heads
+        for i in range(0, nheads_k, heads_k_stride):
+            # Slice query and output for this iteration
+            q_slice = slice(i * nheads // nheads_k, (i + heads_k_stride) * nheads // nheads_k)
+            q_i = q[:, :, q_slice]
+            dout_i = dout[:, :, q_slice]
+
+            # Wait for previous all-gather to complete
+            comm.wait()
+            kv_buffer, kv_buffer_copy = kv_buffer_copy, kv_buffer
+
+            # All-gather the next portion of KV buffers if not the last iteration
+            if i < nheads_k - heads_k_stride:
+                kvsl = i + heads_k_stride
+                kvsr = kvsl + heads_k_stride
+                send_k = k[:, :, kvsl:kvsr].contiguous()
+                send_v = v[:, :, kvsl:kvsr].contiguous()
+                comm.all_gather(kv_buffer_copy[0], send_k)
+                comm.all_gather(kv_buffer_copy[1], send_v)
+
+            # Prepare key, value for attention
+            k_i = kv_buffer[0]
+            v_i = kv_buffer[1]
+
+            # Rearrange query, key, value to (b, s, h, d)
+            q_i = einops.rearrange(q_i, 's b h d -> b s h d')
+            k_i = einops.rearrange(k_i, 's b h d -> b s h d')
+            v_i = einops.rearrange(v_i, 's b h d -> b s h d')
+            dout_i = einops.rearrange(dout_i, 's b h d -> b s h d')
+
+            # Backward pass
+            dq_i, _dk_i, _dv_i, _ = eager_attn_bwd(
+                q_i, k_i, v_i, attn_bias, None, ctx.scale, ctx.dropout, outs[i], probs[i], dout_i
+            )
+
+            # Rearrange gradients to (s, b, h, d)
+            dq_i = einops.rearrange(dq_i, 'b s h d -> s b h d')
+            _dk_i = einops.rearrange(_dk_i, 'b s h d -> s b h d')
+            _dv_i = einops.rearrange(_dv_i, 'b s h d -> s b h d')
+            if pg is None:
+                dk_i = _dk_i
+                dv_i = _dv_i
+            else:
+                # Reduce-scatter gradients if CP > 1
+                dk_i = torch.zeros(
+                    (k_i.shape[1] // cp_size, k_i.shape[0], k_i.shape[2], k_i.shape[3]),
+                    device=k_i.device,
+                    dtype=k_i.dtype,
+                )
+                dv_i = torch.zeros(
+                    (v_i.shape[1] // cp_size, v_i.shape[0], v_i.shape[2], v_i.shape[3]),
+                    device=v_i.device,
+                    dtype=v_i.dtype,
+                )
+                torch.distributed.reduce_scatter_tensor(dk_i, _dk_i, group=pg)
+                torch.distributed.reduce_scatter_tensor(dv_i, _dv_i, group=pg)
+
+            # Collect gradients
+            dq.append(dq_i)
+            dk.append(dk_i)
+            dv.append(dv_i)
+
+        # Concatenate gradients and return
+        dq = torch.cat(dq, dim=2)
+        dk = torch.cat(dk, dim=2)
+        dv = torch.cat(dv, dim=2)
+        return dq, dk, dv, None, None, None, None
diff --git a/megatron/core/transformer/experimental_attention_variant/absorbed_mla.py b/megatron/core/transformer/experimental_attention_variant/absorbed_mla.py
index 860118b17a3..6c6d5b07a75 100644
--- a/megatron/core/transformer/experimental_attention_variant/absorbed_mla.py
+++ b/megatron/core/transformer/experimental_attention_variant/absorbed_mla.py
@@ -18,6 +18,7 @@
 
 import torch
 
+from megatron.core import tensor_parallel
 from megatron.core.extensions.transformer_engine import HAVE_TE
 from megatron.core.models.common.embeddings import (
     RotaryEmbedding,
@@ -112,6 +113,9 @@ def __init__(
         )
 
         assert not config.add_bias_linear, "add_bias_linear is not supported for AbsorbedMLA"
+        assert not (
+            config.tensor_model_parallel_size > 1 and not config.sequence_parallel
+        ), "AbsorbedMLA requires sequence_parallel when tensor_model_parallel_size > 1"
 
         self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads
         self.q_head_dim = self.config.qk_head_dim + self.config.qk_pos_emb_head_dim
@@ -594,6 +598,7 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
                     cu_seqlens=cu_seqlens_q,
                     mscale=mscale,
                     cp_group=self.pg_collection.cp,
+                    mla_rotary_interleaved=True,
                 )
                 # k_pos_emb:[num_tokens, 1, qk_pos_emb_head_dim]
                 k_pos_emb = apply_rotary_pos_emb(
@@ -603,6 +608,7 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
                     cu_seqlens=cu_seqlens_kv,
                     mscale=mscale,
                     cp_group=self.pg_collection.cp,
+                    mla_rotary_interleaved=True,
                 )
 
                 # query: [num_tokens, n, (kv_lora_rank + qk_pos_emb_head_dim)]
diff --git a/megatron/core/transformer/experimental_attention_variant/csa.py b/megatron/core/transformer/experimental_attention_variant/csa.py
new file mode 100644
index 00000000000..86f4cc78f63
--- /dev/null
+++ b/megatron/core/transformer/experimental_attention_variant/csa.py
@@ -0,0 +1,774 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+import copy
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from megatron.core.fusions.fused_mla_yarn_rope_apply import fused_mla_rope_inplace
+from megatron.core.models.common.embeddings import RotaryEmbedding, apply_rotary_pos_emb
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.experimental_attention_variant.dsa import (
+    DSAIndexerLossAutoScaler,
+    DSAIndexerLossLoggingHelper,
+    FusedDSAIndexerLoss,
+    fused_qk_topk_naive,
+    rotate_activation,
+)
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.spec_utils import ModuleSpec, build_module
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import nvtx_range_pop, nvtx_range_push
+
+# ---------------------------------------------------------------------------
+# Helper functions for index computation
+# ---------------------------------------------------------------------------
+
+
+# TODO: the lru_cache may not work well with packed sequence
+@lru_cache(maxsize=8)
+def _get_window_topk_idxs_cached(window_size: int, seqlen: int, device_str: str) -> torch.Tensor:
+    """Compute sliding-window indices for a single sequence (cached).
+
+    Returns:
+        indices: [seqlen, window_size] int tensor, -1 for invalid positions.
+    """
+    base = torch.arange(seqlen, device=device_str).unsqueeze(1)
+    offsets = torch.arange(window_size, device=device_str)
+    matrix = (base - window_size + 1).clamp(min=0) + offsets
+    matrix = torch.where(matrix > base, -1, matrix)
+    return matrix
+
+
+def get_window_topk_idxs(
+    window_size: int, batch_size: int, seqlen: int, device: torch.device
+) -> torch.Tensor:
+    """Sliding-window indices [batch, seqlen, window_size]."""
+    matrix = _get_window_topk_idxs_cached(window_size, seqlen, str(device))
+    return matrix.unsqueeze(0).expand(batch_size, -1, -1)
+
+
+# TODO: the lru_cache may not work well with packed sequence
+@lru_cache(maxsize=8)
+def _get_compress_topk_idxs_cached(
+    ratio: int, seqlen: int, offset: int, device_str: str
+) -> torch.Tensor:
+    """Compute all-compressed-positions indices for a single sequence (cached).
+
+    Returns:
+        indices: [seqlen, seqlen // ratio] int tensor, -1 for future positions.
+    """
+    n_compressed = seqlen // ratio
+    matrix = torch.arange(n_compressed, device=device_str).repeat(seqlen, 1)
+    mask = matrix >= torch.arange(1, seqlen + 1, device=device_str).unsqueeze(1) // ratio
+    matrix = torch.where(mask, -1, matrix + offset)
+    return matrix
+
+
+def get_compress_topk_idxs(
+    ratio: int, batch_size: int, seqlen: int, offset: int, device: torch.device
+) -> torch.Tensor:
+    """All-compressed-position indices [batch, seqlen, seqlen // ratio]."""
+    matrix = _get_compress_topk_idxs_cached(ratio, seqlen, offset, str(device))
+    return matrix.unsqueeze(0).expand(batch_size, -1, -1)
+
+
+# ---------------------------------------------------------------------------
+# Helper functions for RoPE
+# ---------------------------------------------------------------------------
+
+
+def _apply_rope(
+    x: torch.Tensor,
+    nope_dim: int,
+    pos_dim: int,
+    rotary_pos_emb_module: RotaryEmbedding,
+    config: TransformerConfig,
+    rotary_seq_len: int,
+    ratio: int = 1,
+    cp_group: torch.distributed.ProcessGroup = None,
+) -> torch.Tensor:
+    """Apply RoPE to the last ``qk_pos_emb_head_dim`` dims, leaving the rest unchanged.
+
+    Accepts both 3-D ``[seq, batch, head_dim]`` and 4-D ``[seq, batch, heads, head_dim]``
+    inputs.  When the input is 3-D a temporary head dimension is inserted for
+    ``apply_rotary_pos_emb`` and removed before returning.
+    """
+    if ratio == 1:
+        total_seq_len = rotary_seq_len
+    else:
+        total_seq_len = rotary_seq_len * ratio
+    mscale = 1.0
+    rotary_pos_cos = None
+    rotary_pos_sin = None
+    if config.rope_type == "rope":
+        rotary_pos_emb = rotary_pos_emb_module(total_seq_len, packed_seq=False)
+        mscale = 1.0
+    else:
+        if config.apply_rope_fusion:
+            rotary_pos_cos, rotary_pos_sin = rotary_pos_emb_module.get_cached_cos_sin(
+                total_seq_len, dtype=x.dtype, packed_seq=False
+            )
+            rotary_pos_emb = None
+            assert (
+                fused_mla_rope_inplace is not None
+            ), "Fused MLA RoPE apply is not imported successfully"
+        else:
+            rotary_pos_emb, mscale = rotary_pos_emb_module(total_seq_len, packed_seq=False)
+            # DSv4 reference (DS-Inf) RoPE is pure rotation (norm-preserving). Yarn's
+            # concentration factor (mscale) is NOT part of the DSv4 model contract --
+            # the model relies on Q/KV RMS-norm + unit-magnitude rotation. Force 1.0.
+            mscale = 1.0
+    if rotary_pos_emb is not None and ratio > 1:
+        rotary_pos_emb = rotary_pos_emb[:total_seq_len:ratio][:rotary_seq_len]
+    if rotary_pos_cos is not None and ratio > 1:
+        rotary_pos_cos = rotary_pos_cos[:total_seq_len:ratio][:rotary_seq_len]
+    if rotary_pos_sin is not None and ratio > 1:
+        rotary_pos_sin = rotary_pos_sin[:total_seq_len:ratio][:rotary_seq_len]
+
+    squeeze_head = x.dim() == 3
+    if squeeze_head:
+        x = x.unsqueeze(-2)
+    if config.apply_rope_fusion:
+        out = fused_mla_rope_inplace(
+            x,
+            rotary_pos_cos,
+            rotary_pos_sin,
+            nope_dim,
+            pos_dim,
+            None,
+            cp_group.rank(),
+            cp_group.size(),
+            remove_interleaving=True,
+        )
+    else:
+        x_nope, x_pe = torch.split(x, [nope_dim, pos_dim], dim=-1)
+        x_pe = apply_rotary_pos_emb(
+            x_pe,
+            rotary_pos_emb,
+            config=config,
+            cu_seqlens=None,
+            mscale=mscale,
+            cp_group=cp_group,
+            mla_rotary_interleaved=True,
+            mla_output_remove_interleaving=True,
+        )
+        out = torch.cat([x_nope, x_pe], dim=-1)
+    if squeeze_head:
+        out = out.squeeze(-2)
+    return out
+
+
+# ---------------------------------------------------------------------------
+# Sparse attention kernel (unfused, differentiable)
+# ---------------------------------------------------------------------------
+
+
+def unfused_compressed_sparse_attn(
+    query: torch.Tensor,
+    kv_full: torch.Tensor,
+    attn_sink: torch.Tensor,
+    topk_indices: torch.Tensor,
+    softmax_scale: float,
+) -> torch.Tensor:
+    """Differentiable sparse attention with MQA and attention sink.
+
+    Args:
+        query:        [sq, b, np, hn]   multi-head query.
+        kv_full:      [n_kv, b, hn]     single-head KV (original + compressed).
+        attn_sink:    [np]              per-head learnable bias.
+        topk_indices: [b, sq, topk]     indices into kv_full (int32, -1 = invalid).
+        softmax_scale: float
+
+    Returns:
+        output:       [sq, b, np * hn]
+    """
+    sq, b, np_, hn = query.size()
+
+    # --- Gather KV at topk positions ---
+    # kv_full: [n_kv, b, hn] -> [b, n_kv, hn]
+    kv_t = kv_full.permute(1, 0, 2)
+
+    safe_indices = topk_indices.clamp(min=0).long()  # [b, sq, topk]
+    safe_indices_exp = safe_indices.unsqueeze(-1).expand(-1, -1, -1, hn)  # [b, sq, topk, hn]
+    # [b, n_kv, hn] -> [b, 1, n_kv, hn] -> gather -> [b, sq, topk, hn]
+    kv_gathered = torch.gather(
+        kv_t.unsqueeze(1).expand(-1, sq, -1, -1), dim=2, index=safe_indices_exp
+    )
+
+    # --- Attention scores ---
+    # query: [sq, b, np, hn] -> [b, np, sq, hn]
+    q = query.permute(1, 2, 0, 3).float()
+    kv_g = kv_gathered.float()  # [b, sq, topk, hn]
+
+    # [b, np, sq, topk]
+    scores = torch.einsum("bnsh,bskh->bnsk", q, kv_g) * softmax_scale
+
+    # Mask invalid
+    invalid_mask = (topk_indices < 0).unsqueeze(1)  # [b, 1, sq, topk]
+    scores = scores.masked_fill(invalid_mask, float("-inf"))
+
+    # --- Softmax with attention sink ---
+    sink = attn_sink.view(1, np_, 1, 1).float()
+    scores_max = scores.max(dim=-1, keepdim=True).values  # [b, np, sq, 1]
+    scores_max = torch.max(scores_max, sink)
+
+    exp_scores = torch.exp(scores - scores_max)  # [b, np, sq, topk]
+    exp_sink = torch.exp(sink - scores_max)  # [1, np, 1, 1]
+
+    sum_exp = exp_scores.sum(dim=-1, keepdim=True) + exp_sink
+    attn_weights = exp_scores / sum_exp  # [b, np, sq, topk]
+
+    # --- Weighted sum ---
+    output = torch.einsum("bnsk,bskh->bnsh", attn_weights, kv_g)
+    output = output.to(query.dtype)
+
+    # [b, np, sq, hn] -> [sq, b, np, hn] -> [sq, b, np * hn]
+    output = output.permute(2, 0, 1, 3).contiguous()
+    output = output.reshape(sq, b, np_ * hn)
+    return output
+
+
+# ---------------------------------------------------------------------------
+# Compressor
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CompressorSubmodules:
+    """Submodule specs for CSA and HCA Compressor."""
+
+    linear_wkv: Union[ModuleSpec, type] = None
+    linear_wgate: Union[ModuleSpec, type] = None
+    norm: Union[ModuleSpec, type] = None
+
+
+class Compressor(MegatronModule):
+    """Gated pooling compressor for CSA and HCA sparse attention.
+
+    Compresses a sequence of tokens into a shorter sequence by pooling groups of
+    ``compress_ratio`` tokens using learned gated weights.
+
+    For ``compress_ratio == 4``, overlapping compression is used (``coff = 2``).
+    For ``compress_ratio == 128``, non-overlapping compression is used (``coff = 1``).
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: CompressorSubmodules,
+        compress_ratio: int,
+        head_dim: int,
+        rotate: bool = False,
+        rotary_pos_emb: nn.Module = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+    ) -> None:
+        super().__init__(config=config)
+
+        if pg_collection is None:
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        self.pg_collection = pg_collection
+
+        self.compress_ratio = compress_ratio
+        self.head_dim = head_dim
+        self.overlap = compress_ratio == 4
+        self.coff = 1 + int(self.overlap)
+        self.rotate = rotate
+        self.qk_pos_emb_head_dim = config.qk_pos_emb_head_dim
+
+        self.rotary_pos_emb = rotary_pos_emb
+
+        proj_out_dim = self.coff * head_dim
+
+        self.linear_wkv = build_module(
+            submodules.linear_wkv,
+            config.hidden_size,
+            proj_out_dim,
+            config=config,
+            init_method=config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+        self.linear_wgate = build_module(
+            submodules.linear_wgate,
+            config.hidden_size,
+            proj_out_dim,
+            config=config,
+            init_method=config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+        # keep to high precision
+        _ape = torch.empty(
+            compress_ratio, proj_out_dim, device=torch.cuda.current_device(), dtype=torch.float32
+        )
+        config.init_method(_ape)
+        self.ape = nn.Parameter(_ape)
+
+        norm_config = copy.copy(config)
+        norm_config.normalization = "RMSNorm"
+        self.norm = build_module(
+            submodules.norm, config=norm_config, hidden_size=head_dim, eps=config.layernorm_epsilon
+        )
+
+    def _overlap_transform(self, tensor: torch.Tensor, fill_value: float = 0) -> torch.Tensor:
+        """Apply overlapping window transform for 4x compression.
+
+        Input shape:  [n_groups, ratio, b, coff * head_dim]
+        Output shape: [n_groups, 2 * ratio, b, head_dim]
+        """
+        n_groups, ratio, b_dim, _ = tensor.size()
+        d = self.head_dim
+        new_tensor = tensor.new_full((n_groups, 2 * ratio, b_dim, d), fill_value)
+        new_tensor[:, ratio:] = tensor[:, :, :, d:]
+        new_tensor[1:, :ratio] = tensor[:-1, :, :, :d]
+        return new_tensor
+
+    def forward(self, x: torch.Tensor) -> Optional[torch.Tensor]:
+        """Compress hidden states into shorter KV sequence.
+
+        Args:
+            x: [sq, b, hidden_size]
+
+        Returns:
+            compressed_kv [sq // ratio, b, head_dim] or None if too short.
+        """
+        nvtx_range_push("compressor")
+
+        sq, b, _ = x.size()
+        ratio = self.compress_ratio
+
+        if sq < ratio:
+            nvtx_range_pop("compressor")
+            return None
+
+        kv, _ = self.linear_wkv(x)  # [sq, b, coff * head_dim]
+        score, _ = self.linear_wgate(x)  # [sq, b, coff * head_dim]
+
+        cutoff = (sq // ratio) * ratio
+        if cutoff < sq:
+            kv = kv[:cutoff]
+            score = score[:cutoff]
+
+        n_compressed = cutoff // ratio
+
+        # Reshape: [n_compressed, ratio, b, coff * head_dim]
+        kv = kv.view(n_compressed, ratio, b, -1)
+        score = score.view(n_compressed, ratio, b, -1)
+
+        # APE: [ratio, coff * head_dim] -> [1, ratio, 1, coff * head_dim]
+        score = score + self.ape.view(1, ratio, 1, -1)
+
+        if self.overlap:
+            kv = self._overlap_transform(kv, fill_value=0)
+            score = self._overlap_transform(score, fill_value=float("-inf"))
+
+        kv = (kv * torch.softmax(score, dim=1)).sum(dim=1)  # [n_compressed, b, head_dim]
+
+        kv = self.norm(kv.to(x.dtype))
+
+        kv = _apply_rope(
+            kv,
+            self.head_dim - self.qk_pos_emb_head_dim,
+            self.qk_pos_emb_head_dim,
+            self.rotary_pos_emb,
+            self.config,
+            n_compressed,
+            ratio=ratio,
+            cp_group=self.pg_collection.cp,
+        )
+
+        if self.rotate:
+            kv = rotate_activation(kv)
+
+        nvtx_range_pop("compressor")
+        return kv  # [n_compressed, b, head_dim]
+
+
+# ---------------------------------------------------------------------------
+# CSAIndexer
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CSAIndexerSubmodules:
+    """Submodule specs for CSAIndexer."""
+
+    linear_wq_b: Union[ModuleSpec, type] = None
+    linear_weights_proj: Union[ModuleSpec, type] = None
+    compressor: Union[ModuleSpec, type] = None
+
+
+class CSAIndexer(MegatronModule):
+    """Learned top-k retrieval over compressed positions for CSA sparse attention.
+
+    Computes index scores to select the most relevant compressed KV positions for each
+    query.  Reuses the scoring logic from ``DSAIndexer`` (einsum -> relu -> weight -> sum
+    -> topk) and ``rotate_activation`` (Hadamard transform) from ``dsa.py``.
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: CSAIndexerSubmodules,
+        compress_ratio: int,
+        rotary_pos_emb: nn.Module = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+    ) -> None:
+        super().__init__(config=config)
+
+        if pg_collection is None:
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        self.pg_collection = pg_collection
+
+        self.compress_ratio = compress_ratio
+        self.hidden_size = config.hidden_size
+        self.qk_pos_emb_head_dim = config.qk_pos_emb_head_dim
+        self.q_lora_rank = (
+            config.q_lora_rank if config.q_lora_rank is not None else config.hidden_size
+        )
+
+        self.index_n_heads = config.dsa_indexer_n_heads
+        self.index_head_dim = config.dsa_indexer_head_dim
+        self.index_topk = config.dsa_indexer_topk
+
+        self.softmax_scale: float = self.index_head_dim**-0.5
+
+        self.rotary_pos_emb = rotary_pos_emb
+
+        # Q projection
+        self.linear_wq_b = build_module(
+            submodules.linear_wq_b,
+            self.q_lora_rank,
+            self.index_n_heads * self.index_head_dim,
+            config=config,
+            init_method=config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+        # Weights projection
+        self.linear_weights_proj = build_module(
+            submodules.linear_weights_proj,
+            self.hidden_size,
+            self.index_n_heads,
+            config=config,
+            init_method=config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            skip_weight_param_allocation=False,
+            parallel_mode="duplicated",
+        )
+
+        # Own compressor (smaller head_dim, with Hadamard rotation)
+        self.compressor = build_module(
+            submodules.compressor,
+            config=config,
+            compress_ratio=compress_ratio,
+            head_dim=self.index_head_dim,
+            rotate=True,
+            rotary_pos_emb=rotary_pos_emb,
+            pg_collection=pg_collection,
+        )
+
+    def forward_before_topk(
+        self, x: torch.Tensor, qr: torch.Tensor, packed_seq_params: Optional[PackedSeqParams] = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Compute Q, compressed K, and weights before top-k selection."""
+        nvtx_range_push("indexer_before_topk")
+
+        sq, bsz, _ = x.size()
+
+        # Q path
+        q, _ = self.linear_wq_b(qr)  # [sq, b, n_heads * head_dim]
+        q = q.reshape(sq, bsz, self.index_n_heads, self.index_head_dim)
+        q = _apply_rope(
+            q,
+            self.index_head_dim - self.qk_pos_emb_head_dim,
+            self.qk_pos_emb_head_dim,
+            self.rotary_pos_emb,
+            self.config,
+            sq,
+            ratio=1,
+            cp_group=self.pg_collection.cp,
+        )
+        q = rotate_activation(q)
+
+        # K path: own compressor
+        k = self.compressor(x)  # [sq//ratio, b, index_head_dim]
+
+        weights, _ = self.linear_weights_proj(x)  # [sq, b, n_heads]
+        weights = weights * (self.index_n_heads**-0.5)
+
+        nvtx_range_pop("indexer_before_topk")
+        return q, k, weights
+
+    def forward(
+        self,
+        x: torch.Tensor,
+        qr: torch.Tensor,
+        mask: Optional[torch.Tensor] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Return (index_scores, topk_indices)."""
+        nvtx_range_push("indexer")
+        assert packed_seq_params is None, "Packed sequence not supported for CSAIndexer"
+        q, k, weights = self.forward_before_topk(x, qr, packed_seq_params)
+        nvtx_range_push("indexer_qk_topk")
+        effective_topk = min(self.index_topk, k.size(0))
+        index_scores, topk_indices = fused_qk_topk_naive(q, k, weights, effective_topk, mask)
+        nvtx_range_pop("indexer_qk_topk")
+        nvtx_range_pop("indexer")
+        return index_scores, topk_indices
+
+
+# ---------------------------------------------------------------------------
+# CompressedSparseAttention (core attention)
+# ---------------------------------------------------------------------------
+
+
+@dataclass
+class CompressedSparseAttentionSubmodules:
+    """Submodule specs for CompressedSparseAttention."""
+
+    compressor: Union[ModuleSpec, type] = None
+    indexer: Union[ModuleSpec, type] = None
+
+
+class CompressedSparseAttention(MegatronModule):
+    """Sparse core attention for CompressedSparseAttention.
+
+    Combines sliding window attention with compressed KV attention.  The spec always
+    provides compressor and indexer submodule specs; this ``__init__`` inspects
+    ``config.csa_compress_ratios[layer_idx]`` and conditionally builds them:
+
+    * ``ratio == 0``:  window-only (compressor and indexer NOT built)
+    * ``ratio == 4``:  window + 4x compressed + learned Indexer (both built)
+    * ``ratio == 128``: window + 128x compressed, attend to all (compressor built only)
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: CompressedSparseAttentionSubmodules,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        attention_dropout: Optional[float] = None,
+        softmax_scale: Optional[float] = None,
+        k_channels: Optional[int] = None,
+        v_channels: Optional[int] = None,
+        cp_comm_type: str = "p2p",
+        pg_collection: Optional[ProcessGroupCollection] = None,
+        rotary_pos_emb: nn.Module = None,
+        compress_ratio: int = 0,
+    ):
+        super().__init__(config=config)
+
+        if pg_collection is None:
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+        self.pg_collection = pg_collection
+
+        self.layer_number = layer_number
+        self.compress_ratio = compress_ratio
+        self.window_size = config.csa_window_size
+        self.v_head_dim = config.v_head_dim
+
+        self.n_local_heads = config.num_attention_heads
+
+        if softmax_scale is None:
+            softmax_scale = config.v_head_dim**-0.5
+        self.softmax_scale = softmax_scale
+
+        self.force_unfused_dsa = getattr(config, 'force_unfused_dsa', True)
+
+        # Learnable attention sink per head
+        self.attn_sink = nn.Parameter(torch.zeros(self.n_local_heads, dtype=torch.float32))
+
+        # Conditionally build Compressor (ratio > 1)
+        if self.compress_ratio > 1 and submodules.compressor is not None:
+            self.compressor = build_module(
+                submodules.compressor,
+                config=config,
+                compress_ratio=self.compress_ratio,
+                head_dim=config.v_head_dim,
+                rotate=False,
+                rotary_pos_emb=rotary_pos_emb,
+                pg_collection=pg_collection,
+            )
+        else:
+            self.compressor = None
+
+        # Conditionally build Indexer (ratio == 4)
+        if (
+            self.compress_ratio == 4
+            and not config.csa_dense_mode
+            and submodules.indexer is not None
+        ):
+            self.indexer = build_module(
+                submodules.indexer,
+                config=config,
+                compress_ratio=self.compress_ratio,
+                rotary_pos_emb=rotary_pos_emb,
+                pg_collection=pg_collection,
+            )
+        else:
+            self.indexer = None
+
+    def forward(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        attention_mask: torch.Tensor,
+        x: torch.Tensor = None,
+        qr: torch.Tensor = None,
+        attn_mask_type: AttnMaskType = None,
+        attention_bias: torch.Tensor = None,
+        packed_seq_params: PackedSeqParams = None,
+    ) -> torch.Tensor:
+        """Forward pass for CompressedSparseAttention.
+
+        Args:
+            query:  [sq, b, np, v_head_dim]
+            key:    [sq, b, 1, v_head_dim]  (single-head MQA; head dim squeezed internally)
+            value:  unused (key == value in MQA)
+            attention_mask: attention mask (may be None for causal).
+            x:      [sq, b, hidden_size]  original hidden states.
+            qr:     [sq, b, q_lora_rank]  compressed query representation.
+
+        Returns:
+            output: [sq, b, np * v_head_dim]
+        """
+        nvtx_range_push("compressed_sparse_attn")
+        assert (
+            packed_seq_params is None
+        ), "Packed sequence not supported for CompressedSparseAttention"
+
+        sq, b, np, hn = query.size()
+
+        # --- Step 1: Prepare single-head KV (squeeze singleton head dim) ---
+        kv = key.squeeze(-2)  # [sq, b, 1, v_head_dim] -> [sq, b, v_head_dim]
+
+        # --- Step 2: Compression ---
+        if self.compressor is not None and self.compress_ratio > 1:
+            compressed_kv = self.compressor(x)  # [n_compressed, b, v_head_dim]
+            if compressed_kv is not None:
+                kv_full = torch.cat([kv, compressed_kv], dim=0)
+                n_compressed = compressed_kv.size(0)
+            else:
+                kv_full = kv
+                n_compressed = 0
+        else:
+            kv_full = kv
+            n_compressed = 0
+
+        offset = sq  # compressed indices start after original positions
+
+        # --- Step 3: Window indices ---
+        window_idxs = get_window_topk_idxs(self.window_size, b, sq, query.device)
+
+        # --- Step 4: Compressed indices ---
+        indexer_loss = None
+
+        if self.force_unfused_dsa:
+            if self.compress_ratio > 1 and n_compressed > 0:
+                nvtx_range_push("compressed_indices")
+                if self.indexer is not None:
+                    x_det = x.detach()
+                    qr_det = qr.detach()
+
+                    causal_mask = (
+                        torch.arange(n_compressed, device=x.device).unsqueeze(0).expand(sq, -1)
+                    )
+                    positions = torch.arange(1, sq + 1, device=x.device).unsqueeze(1)
+                    causal_mask = (
+                        torch.where(
+                            causal_mask >= positions // self.compress_ratio, float("-inf"), 0.0
+                        )
+                        .unsqueeze(0)
+                        .expand(b, -1, -1)
+                    )  # [b, sq, n_compressed]
+
+                    if self.training and torch.is_grad_enabled():
+                        q_indexer, k_indexer, weights_indexer = self.indexer.forward_before_topk(
+                            x_det, qr_det, packed_seq_params
+                        )
+                        indexer_loss_coeff = getattr(self.config, 'dsa_indexer_loss_coeff', 0.0)
+                        # compressed_kv is [n, b, hn]; expand to [n, b, np, hn] for loss
+                        key_for_loss = compressed_kv.unsqueeze(2).expand(-1, -1, np, -1)
+                        # ``FusedDSAIndexerLoss`` does not accept a separate
+                        # indexer_softmax_scale; apply it here via the
+                        # weights-scaling trick so the effective weights match
+                        # the pre-scale-split behaviour.
+                        weights_for_unfused = weights_indexer * self.indexer.softmax_scale
+                        topk_indices_compressed, indexer_loss = FusedDSAIndexerLoss.apply(
+                            q_indexer,
+                            weights_for_unfused,
+                            k_indexer,
+                            query.detach(),
+                            key_for_loss.detach(),
+                            self.softmax_scale,
+                            min(self.indexer.index_topk, n_compressed),
+                            indexer_loss_coeff,
+                            causal_mask,
+                            getattr(self.config, "dsa_indexer_use_sparse_loss", True),
+                            self.indexer.pg_collection,
+                        )
+                        if indexer_loss_coeff > 0:
+                            DSAIndexerLossLoggingHelper.save_loss_to_tracker(
+                                loss=indexer_loss,
+                                layer_number=self.layer_number,
+                                num_layers=self.config.num_layers,
+                            )
+                    else:
+                        _, topk_indices_compressed = self.indexer(
+                            x_det, qr_det, mask=causal_mask, packed_seq_params=packed_seq_params
+                        )
+
+                    n_valid_per_pos = positions // self.compress_ratio  # [sq, 1]
+                    valid = topk_indices_compressed < n_valid_per_pos
+                    compress_topk_idxs = torch.where(
+                        valid, topk_indices_compressed + offset, torch.tensor(-1, device=x.device)
+                    )
+                else:
+                    compress_topk_idxs = get_compress_topk_idxs(
+                        self.compress_ratio, b, sq, offset, query.device
+                    )
+
+                topk_idxs = torch.cat([window_idxs, compress_topk_idxs], dim=-1)
+                nvtx_range_pop("compressed_indices")
+            else:
+                topk_idxs = window_idxs
+
+            topk_idxs = topk_idxs.int()
+
+            # --- Step 5: Sparse attention ---
+            nvtx_range_push("sparse_attn_kernel")
+            output = unfused_compressed_sparse_attn(
+                query, kv_full, self.attn_sink.float(), topk_idxs, self.softmax_scale
+            )
+            nvtx_range_pop("sparse_attn_kernel")
+
+        else:
+            raise ValueError("Fused path is not supported for CompressedSparseAttention")
+
+        # --- Step 6: Attach indexer loss ---
+        if indexer_loss is not None and self.training and torch.is_grad_enabled():
+            output = DSAIndexerLossAutoScaler.apply(output, indexer_loss)
+
+        nvtx_range_pop("compressed_sparse_attn")
+        return output
diff --git a/megatron/core/transformer/experimental_attention_variant/deepseek_v4_hybrid_attention.py b/megatron/core/transformer/experimental_attention_variant/deepseek_v4_hybrid_attention.py
new file mode 100644
index 00000000000..ffb9ed33373
--- /dev/null
+++ b/megatron/core/transformer/experimental_attention_variant/deepseek_v4_hybrid_attention.py
@@ -0,0 +1,723 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+
+from dataclasses import dataclass
+from typing import NoReturn, Optional, Union
+
+import torch
+
+from megatron.core import tensor_parallel
+from megatron.core.extensions.transformer_engine import HAVE_TE
+from megatron.core.models.common.embeddings import (
+    RotaryEmbedding,
+    YarnRotaryEmbedding,
+    apply_rotary_pos_emb,
+)
+from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+    FineGrainedActivationOffloadingInterface as off_interface,
+)
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.transformer.attention import Attention
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.spec_utils import ModuleSpec, build_module
+from megatron.core.transformer.torch_norm import LayerNormBuilder
+from megatron.core.transformer.transformer_config import MLATransformerConfig
+from megatron.core.typed_torch import apply_module
+from megatron.core.utils import get_pg_size, is_te_min_version
+
+try:
+    from megatron.core.fusions.fused_mla_yarn_rope_apply import fused_mla_rope_inplace
+except Exception:
+    fused_mla_rope_inplace = None
+
+
+if HAVE_TE:
+    from megatron.core.extensions.transformer_engine import TELinear, set_save_original_input
+else:
+    (TEColumnParallelLinear, TELinear, set_save_original_input) = (None, None, None)
+
+
+@torch.compile
+def _q_rms_norm(q: torch.Tensor, eps: float) -> torch.Tensor:
+    """Fused RMS normalization for query tensor (no learnable weight)."""
+    return q * torch.rsqrt(q.square().mean(-1, keepdim=True) + eps)
+
+
+@dataclass
+class DSv4HybridSelfAttentionSubmodules:
+    """Submodules for the DSv4HybridAttention layer."""
+
+    q_layernorm: LayerNormBuilder
+    kv_layernorm: LayerNormBuilder
+
+    linear_q_down_proj: Union[ModuleSpec, type] = None
+    linear_q_up_proj: Union[ModuleSpec, type] = None
+    linear_kv_proj: Union[ModuleSpec, type] = None
+    core_attention: Union[ModuleSpec, type] = None
+    linear_proj: Union[ModuleSpec, type] = None
+
+
+class DSv4HybridAttention(Attention):
+    """DeepSeek-v4 Hybrid Attention layer."""
+
+    def __init__(
+        self,
+        config: MLATransformerConfig,
+        submodules: DSv4HybridSelfAttentionSubmodules,
+        layer_number: int,
+        attn_mask_type: AttnMaskType,
+        attention_type: str,
+        cp_comm_type: Optional[str] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+        is_mtp_layer: bool = False,
+    ) -> None:
+
+        super().__init__(
+            config=config,
+            submodules=submodules,
+            layer_number=layer_number,
+            attention_type=attention_type,
+            attn_mask_type=attn_mask_type,
+            pg_collection=pg_collection,
+            is_mtp_layer=is_mtp_layer,
+        )
+        self.config: MLATransformerConfig
+
+        assert (
+            get_pg_size(self.pg_collection.tp) == 1
+        ), "DSv4 Hybrid Attention only supports TP size 1."
+
+        assert (
+            not self.checkpoint_core_attention
+        ), "Checkpoint core attention is not supported in DSv4 Hybrid Attention."
+        assert (
+            not self.offload_qkv_linear
+        ), "Offload qkv linear is not supported in DSv4 Hybrid Attention."
+
+        self.query_projection_size = self.config.v_head_dim * self.config.num_attention_heads
+
+        self.q_head_dim = self.config.v_head_dim
+
+        self.key_hidden_size = self.q_head_dim
+        self.val_hidden_size = self.config.v_head_dim
+
+        self.recompute_up_proj = (
+            self.config.recompute_granularity == 'selective'
+            and "mla_up_proj" in self.config.recompute_modules
+        )
+        self.qkv_up_checkpoint = None
+
+        self.softmax_scale = None
+
+        if is_mtp_layer:
+            layer_idx = self.config.num_layers + layer_number - 1
+            compress_ratio = self.config.csa_compress_ratios[layer_idx]
+        else:
+            compress_ratio = self.config.csa_compress_ratios[layer_number - 1]
+        rope_base = self.config.rotary_base
+        if compress_ratio > 1:
+            rope_base = self.config.csa_compress_rotary_base
+        if self.config.rope_type == "rope":
+            self.rotary_pos_emb = RotaryEmbedding(
+                self.config.qk_pos_emb_head_dim,
+                rotary_percent=self.config.rotary_percent,
+                rotary_base=rope_base,
+                cp_group=self.pg_collection.cp,
+            )
+        elif self.config.rope_type == "yarn":
+            self.rotary_pos_emb = YarnRotaryEmbedding(
+                self.config.qk_pos_emb_head_dim,
+                rotary_base=rope_base,
+                scaling_factor=self.config.rotary_scaling_factor,
+                original_max_position_embeddings=self.config.original_max_position_embeddings,
+                beta_fast=self.config.beta_fast,
+                beta_slow=self.config.beta_slow,
+                mscale=self.config.mscale,
+                mscale_all_dim=self.config.mscale_all_dim,
+                cp_group=self.pg_collection.cp,
+            )
+        else:
+            raise ValueError(
+                f"Unsupported RoPE type: {self.config.rope_type}, supported types are "
+                "'rope' and 'yarn'"
+            )
+
+        core_attn_extra_kwargs = {
+            "rotary_pos_emb": self.rotary_pos_emb,
+            "compress_ratio": compress_ratio,
+        }
+        self.core_attention = build_module(
+            submodules.core_attention,
+            config=self.config,
+            layer_number=self.layer_number,
+            attn_mask_type=self.attn_mask_type,
+            attention_type=self.attention_type,
+            softmax_scale=self.softmax_scale,
+            k_channels=self.q_head_dim,
+            v_channels=self.config.v_head_dim,
+            cp_comm_type=cp_comm_type,
+            pg_collection=self.pg_collection,
+            **core_attn_extra_kwargs,
+        )
+
+        # Output.
+        self.o_local_groups = self.config.o_groups
+        assert (
+            self.query_projection_size % self.config.o_groups == 0
+        ), "num_attention_heads * v_head_dim must be divisible by o_groups"
+        group_proj_in_size = self.query_projection_size // self.config.o_groups
+        group_proj_out_size = self.config.o_groups * self.config.o_lora_rank
+
+        _linear_o_group_proj = torch.empty(
+            group_proj_out_size,
+            group_proj_in_size,
+            device=torch.cuda.current_device(),
+            dtype=self.config.params_dtype,
+        )
+        self.config.init_method(_linear_o_group_proj)
+        self.linear_o_group_proj = torch.nn.Parameter(_linear_o_group_proj)
+
+        linear_proj_in_size = self.config.o_groups * self.config.o_lora_rank
+
+        self.linear_proj = build_module(
+            submodules.linear_proj,
+            linear_proj_in_size,
+            self.config.hidden_size,
+            config=self.config,
+            init_method=self.config.output_layer_init_method,
+            bias=self.config.add_bias_linear,
+            input_is_parallel=True,
+            skip_bias_add=True,
+            is_expert=False,
+            tp_comm_buffer_name='proj',
+            tp_group=self.pg_collection.tp,
+        )
+
+        if (
+            HAVE_TE
+            and isinstance(self.linear_proj, TELinear)
+            and (
+                (
+                    self.config.fp8
+                    and self.config.fp8_recipe != 'delayed'
+                    and is_te_min_version("2.6.0dev0")
+                )
+                or (self.config.fp4 and is_te_min_version("2.7.0.dev0"))
+            )
+        ):
+            # For fp8/fp4 training, the output of the fused core_attn is saved by itself, and
+            # linear_proj also saves the quantized tensor of this output. Here we set the
+            # linear_proj to save the original input tensors to avoid the extra memory usage of
+            # the quantized tensor.
+            set_save_original_input(self.linear_proj)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        key_value_states=None,
+        inference_context=None,
+        rotary_pos_emb=None,
+        rotary_pos_cos=None,
+        rotary_pos_sin=None,
+        rotary_pos_cos_sin=None,
+        attention_bias=None,
+        packed_seq_params=None,
+        position_ids=None,
+        sequence_len_offset=None,
+        *,
+        inference_params=None,
+    ):
+        """Forward pass for DeepSeek-v4 Hybrid Attention"""
+        assert (
+            rotary_pos_emb is None
+        ), "Rotary position embeddings should not be passed into DSv4HybridAttention."
+        assert (
+            attention_bias is None
+        ), "Attention bias should not be passed into DSv4HybridAttention."
+        assert (
+            rotary_pos_cos is None and rotary_pos_sin is None
+        ), "DSv4HybridAttention does not support Flash Decoding"
+        assert (
+            not rotary_pos_cos_sin
+        ), "Flash-infer rope has not been tested with DSv4HybridAttention."
+        assert (
+            inference_context is None and inference_params is None
+        ), "Inference is not supported for DSv4HybridAttention."
+
+        # =====================
+        # Query, Key, and Value
+        # =====================
+        # Get the query, key and value tensors based on the type of attention -
+        # self or cross attn.
+        query, key, value, q_compressed, kv_compressed = self.get_query_key_value_tensors(
+            hidden_states,
+            key_value_states,
+            position_ids,
+            packed_seq_params,
+            inference_context=inference_context,
+        )
+
+        # TODO: Currently, TE can only accept contiguous tensors for MLA
+        query = query.contiguous()
+        key = key.contiguous()
+        value = value.contiguous()
+
+        # ==================================
+        # core attention computation
+        # ==================================
+        # Need corresponding TE change
+        core_attn_manager = off_interface(
+            self.offload_core_attention and self.training, query, "core_attn"
+        )
+        with core_attn_manager as query:
+            core_attn_out = self.core_attention(
+                query,
+                key,
+                value,
+                attention_mask,
+                packed_seq_params=packed_seq_params,
+                x=hidden_states,
+                qr=q_compressed,
+            )
+        core_attn_out = core_attn_manager.group_offload(
+            core_attn_out, forced_released_tensors=[query, key, value]
+        )
+
+        if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
+            # reshape to same output shape as unpacked case
+            # (t, np, hn) -> (t, b=1, h=np*hn)
+            # t is the pack size = sum (sq_i)
+            # note that batch is a dummy dimension in the packed case
+            core_attn_out = core_attn_out.reshape(core_attn_out.size(0), 1, -1)
+
+        if self.recompute_up_proj:
+            assert self.qkv_up_checkpoint is not None
+            self.qkv_up_checkpoint.discard_output_and_register_recompute(core_attn_out)
+            self.qkv_up_checkpoint = None
+
+        # inverse RoPE on last qk_pos_emb_head_dim of each head
+        seq_len = core_attn_out.size(0)
+        n_heads = self.num_attention_heads_per_partition
+        pos_dim = self.config.qk_pos_emb_head_dim
+        nope_dim = self.config.v_head_dim - pos_dim
+        core_attn_out = core_attn_out.view(seq_len, core_attn_out.size(1), n_heads, -1)
+        packed_seq = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd'
+        if packed_seq:
+            cu_seqlens_kv = (
+                packed_seq_params.cu_seqlens_kv_padded
+                if packed_seq_params.cu_seqlens_kv_padded is not None
+                else packed_seq_params.cu_seqlens_kv
+            )
+            rope_seqlen = cu_seqlens_kv
+        else:
+            cu_seqlens_kv = None
+            rope_seqlen = seq_len
+        mscale = 1.0
+        rotary_pos_cos = None
+        rotary_pos_sin = None
+        if self.config.rope_type == "rope":
+            rotary_pos_emb = self.rotary_pos_emb(rope_seqlen, packed_seq=packed_seq)
+        else:
+            if self.config.apply_rope_fusion:
+                rotary_pos_cos, rotary_pos_sin = self.rotary_pos_emb.get_cached_cos_sin(
+                    rope_seqlen, dtype=hidden_states.dtype, packed_seq=packed_seq
+                )
+                rotary_pos_emb = None
+                assert inference_context is None, "Inference with MLA RoPE fusion is not supported"
+                assert (
+                    fused_mla_rope_inplace is not None
+                ), "Fused MLA RoPE apply is not imported successfully"
+            else:
+                rotary_pos_emb, mscale = self.rotary_pos_emb(rope_seqlen, packed_seq=packed_seq)
+                # DSv4 reference (DS-Inf) RoPE is pure rotation (norm-preserving). Yarn's
+                # concentration factor (mscale) is NOT part of the DSv4 model contract --
+                # the model relies on Q/KV RMS-norm + unit-magnitude rotation. Force 1.0.
+                mscale = 1.0
+        if self.config.apply_rope_fusion:
+            core_attn_out = fused_mla_rope_inplace(
+                core_attn_out,
+                rotary_pos_cos,
+                rotary_pos_sin,
+                nope_dim,
+                pos_dim,
+                cu_seqlens_kv,
+                self.pg_collection.cp.rank(),
+                self.pg_collection.cp.size(),
+                inverse=True,
+                remove_interleaving=True,
+            )
+        else:
+            content_part, rot_part = torch.split(
+                core_attn_out, [core_attn_out.size(-1) - pos_dim, pos_dim], dim=-1
+            )
+            rot_part = apply_rotary_pos_emb(
+                rot_part,
+                rotary_pos_emb,
+                self.config,
+                cu_seqlens=cu_seqlens_kv,
+                mscale=mscale,
+                cp_group=self.pg_collection.cp,
+                mla_rotary_interleaved=True,
+                inverse=True,
+                mla_output_remove_interleaving=True,
+            )
+            core_attn_out = torch.cat([content_part, rot_part], dim=-1)
+        core_attn_out = core_attn_out.view(seq_len, core_attn_out.size(1), -1)
+
+        # Grouped output
+        core_attn_out = core_attn_out.view(
+            core_attn_out.size(0), core_attn_out.size(1), self.o_local_groups, -1
+        )
+        wo_a_weight = self.linear_o_group_proj.view(
+            self.o_local_groups, self.config.o_lora_rank, -1
+        )
+        core_attn_out = torch.einsum("...gd,grd->...gr", core_attn_out, wo_a_weight)
+        core_attn_out = core_attn_out.reshape(*core_attn_out.shape[:-2], -1)
+
+        # =================
+        # Output. [sq, b, h]
+        # =================
+        attn_proj_manager = off_interface(self.offload_attn_proj, core_attn_out, "attn_proj")
+        with attn_proj_manager as core_attn_out:
+            output, bias = self.linear_proj(core_attn_out)
+        output = attn_proj_manager.group_offload(output, forced_released_tensors=[core_attn_out])
+
+        return output, bias
+
+
+class DSv4HybridSelfAttention(DSv4HybridAttention):
+    """DSv4Hybrid Self-attention layer class
+
+    Self-attention layer takes input with size [s, b, h]
+    and returns output of the same size.
+    """
+
+    def __init__(
+        self,
+        config: MLATransformerConfig,
+        submodules: DSv4HybridSelfAttentionSubmodules,
+        layer_number: int,
+        attn_mask_type=AttnMaskType.padding,
+        cp_comm_type: Optional[str] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+        is_mtp_layer: bool = False,
+    ):
+        if pg_collection is None:
+            pg_collection = ProcessGroupCollection.use_mpu_process_groups()
+
+        super().__init__(
+            config=config,
+            submodules=submodules,
+            layer_number=layer_number,
+            attn_mask_type=attn_mask_type,
+            attention_type="self",
+            cp_comm_type=cp_comm_type,
+            pg_collection=pg_collection,
+            is_mtp_layer=is_mtp_layer,
+        )
+
+        q_down_proj_kwargs = {}
+        if submodules.linear_q_down_proj in [TELinear]:
+            q_down_proj_kwargs['parallel_mode'] = 'duplicated'
+        else:
+            raise ValueError(f"Unsupported linear_q_down_proj: {submodules.linear_q_down_proj}")
+
+        self.linear_q_down_proj = build_module(
+            submodules.linear_q_down_proj,
+            self.config.hidden_size,
+            self.config.q_lora_rank,
+            config=self.config,
+            init_method=self.config.init_method,
+            bias=False,
+            skip_bias_add=False,
+            is_expert=False,
+            tp_comm_buffer_name='q_down_proj',
+            skip_weight_param_allocation=False,
+            tp_group=None,
+            **q_down_proj_kwargs,
+        )
+
+        self.linear_q_up_proj = build_module(
+            submodules.linear_q_up_proj,
+            self.config.q_lora_rank,
+            self.config.num_attention_heads * self.q_head_dim,
+            config=self.config,
+            init_method=self.config.init_method,
+            gather_output=False,
+            bias=False,
+            skip_bias_add=False,
+            is_expert=False,
+            tp_comm_buffer_name='q_up_proj',
+            tp_group=pg_collection.tp,
+        )
+
+        self.linear_kv_proj = build_module(
+            submodules.linear_kv_proj,
+            self.config.hidden_size,
+            self.config.v_head_dim,
+            config=self.config,
+            init_method=self.config.init_method,
+            gather_output=False,
+            bias=False,
+            skip_bias_add=False,
+            is_expert=False,
+            tp_comm_buffer_name='kv_up_proj',
+            tp_group=pg_collection.tp,
+        )
+        self.kv_layernorm = submodules.kv_layernorm(
+            hidden_size=self.config.v_head_dim,
+            config=self.config,
+            eps=self.config.layernorm_epsilon,
+        )
+
+        self.q_layernorm = submodules.q_layernorm(
+            hidden_size=self.config.q_lora_rank,
+            config=self.config,
+            eps=self.config.layernorm_epsilon,
+        )
+
+    def get_query_key_value_tensors(
+        self,
+        hidden_states,
+        key_value_states=None,
+        position_ids=None,
+        packed_seq_params=None,
+        inference_context=None,
+        *,
+        inference_params=None,
+    ):
+        """
+        Derives `query`, `key` and `value` tensors from `hidden_states`.
+        """
+        # s = sequence length, b = batch size, h = hidden size, n = num attention heads
+        # Attention heads [s, b, n*h]
+        assert (
+            hidden_states.ndim == 3
+        ), f"hidden_states should be 3D, [s, b, n*h], got {hidden_states.ndim}D"
+        if packed_seq_params is not None:
+            assert (
+                packed_seq_params.local_cp_size is None
+            ), "dynamic_context_parallel is not supported with MLA yet and is planned for future. \
+            Please disable dynamic_context_parallel."
+
+        assert (
+            inference_context is None and inference_params is None
+        ), "Inference is not supported for DSv4HybridSelfAttention."
+
+        # =========================================
+        # Prepare RoPE and seqlen related params
+        # =========================================
+        rotary_seq_len = self.rotary_pos_emb.get_rotary_seq_len(
+            inference_context, None, hidden_states, self.config, packed_seq_params
+        )
+
+        # rotary_pos_emb:[s, b, 1, 64]
+        mscale = 1.0
+        rotary_pos_cos = None
+        rotary_pos_sin = None
+        packed_seq = packed_seq_params is not None and packed_seq_params.qkv_format == 'thd'
+        if self.config.rope_type == "rope":
+            rotary_pos_emb = self.rotary_pos_emb(rotary_seq_len, packed_seq=packed_seq)
+        else:
+            if self.config.apply_rope_fusion:
+                rotary_pos_cos, rotary_pos_sin = self.rotary_pos_emb.get_cached_cos_sin(
+                    rotary_seq_len, dtype=hidden_states.dtype, packed_seq=packed_seq
+                )
+                rotary_pos_emb = None
+                assert inference_context is None, "Inference with MLA RoPE fusion is not supported"
+                assert (
+                    fused_mla_rope_inplace is not None
+                ), "Fused MLA RoPE apply is not imported successfully"
+            else:
+                rotary_pos_emb, mscale = self.rotary_pos_emb(rotary_seq_len, packed_seq=packed_seq)
+                # DSv4 reference (DS-Inf) RoPE is pure rotation (norm-preserving). Yarn's
+                # concentration factor (mscale) is NOT part of the DSv4 model contract --
+                # the model relies on Q/KV RMS-norm + unit-magnitude rotation. Force 1.0.
+                mscale = 1.0
+
+        if packed_seq_params is not None and packed_seq_params.qkv_format == 'thd':
+            if packed_seq_params.cu_seqlens_q_padded is not None:
+                cu_seqlens_q = packed_seq_params.cu_seqlens_q_padded
+            else:
+                cu_seqlens_q = packed_seq_params.cu_seqlens_q
+            if packed_seq_params.cu_seqlens_kv_padded is not None:
+                cu_seqlens_kv = packed_seq_params.cu_seqlens_kv_padded
+            else:
+                cu_seqlens_kv = packed_seq_params.cu_seqlens_kv
+        else:
+            cu_seqlens_q = cu_seqlens_kv = None
+
+        # =========================================
+        # QKV down projection and layernorm
+        # =========================================
+        # q_compressed: [s, b, q_lora_rank]
+        q_compressed, _ = self.linear_q_down_proj(hidden_states)
+
+        kv_compressed = hidden_states
+        k_pos_emb = None
+
+        if packed_seq_params is not None:
+            # If sequence packing, TE expect [t, h, d] shaped qkv input.
+            # In Megatron-Core, the qkv shape is [t, 1, h, d].
+            # So we need to reshape qkv from [t, 1, h, d] to [t, h, d].
+            q_compressed = q_compressed.squeeze(1)
+
+        # =========================================
+        # Apply norm
+        # =========================================
+
+        if self.config.q_lora_rank is not None:
+            # q_compressed: [num_tokens, q_lora_rank]
+            q_compressed = apply_module(self.q_layernorm)(q_compressed)
+
+        # =========================================
+        # QKV up projection and RoPE apply
+        # =========================================
+
+        def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_pos_emb):
+            """
+            Apply the up projection and RoPE to the query and key.
+            When sequence packing enabled, the input tensors adopt a packed shape of [t, ...];
+            otherwise, they maintain the unpacked shape [s, b, ...]. In subsequent code comments,
+            we uniformly use [num_tokens, ...] to denote [s, b, ...] or [t, ...] for two cases.
+            """
+            # q_compressed: [num_tokens, q_lora_rank]
+            # q: [num_tokens, n * (qk_head_dim + qk_pos_emb_head_dim)]
+            q, _ = self.linear_q_up_proj(q_compressed)
+
+            # q: [num_tokens, n, q_head_dim]
+            q = q.view(*q.size()[:-1], self.num_attention_heads_per_partition, self.q_head_dim)
+            q = _q_rms_norm(q, self.config.layernorm_epsilon)
+
+            kv, _ = self.linear_kv_proj(kv_compressed)
+            kv = self.kv_layernorm(kv)
+
+            # [num_tokens, qk_pos_emb_head_dim] -> [num_tokens, 1, qk_pos_emb_head_dim]
+            if k_pos_emb is not None:
+                k_pos_emb = torch.unsqueeze(k_pos_emb, -2)
+
+            if self.config.apply_rope_fusion:
+                cp_rank = self.pg_collection.cp.rank()
+                cp_size = self.pg_collection.cp.size()
+                query = fused_mla_rope_inplace(
+                    q,
+                    rotary_pos_cos,
+                    rotary_pos_sin,
+                    self.config.qk_head_dim,
+                    self.config.qk_pos_emb_head_dim,
+                    cu_seqlens_q,
+                    cp_rank,
+                    cp_size,
+                    remove_interleaving=True,
+                )
+                kv = kv.unsqueeze(-2)
+                kv = fused_mla_rope_inplace(
+                    kv,
+                    rotary_pos_cos,
+                    rotary_pos_sin,
+                    self.config.qk_head_dim,
+                    self.config.qk_pos_emb_head_dim,
+                    cu_seqlens_q,
+                    cp_rank,
+                    cp_size,
+                    remove_interleaving=True,
+                )
+                key = kv
+                value = kv
+            else:
+                q_len = q.size()[0]
+                if packed_seq_params is None or self.config.context_parallel_size == 1:
+                    # Shorten rotary_pos_emb to the sequence length when inference_params
+                    # is not provided. This makes sure we can run forward directly with
+                    # any sequence length. During training, the sequence length is always
+                    # the full rotary_pos_emb length, except for sequence packing + CP.
+                    # When sequence packing and context parallel are both enabled, the
+                    # position embedding will not split rotary_pos_emb, so it may exceed
+                    # the sequence length on this CP rank, but we need the full rotary_pos_emb
+                    # to cover the full sequence, so we do not shorten it here.
+                    rotary_pos_emb = rotary_pos_emb[0:q_len]
+
+                # q_no_pe: [num_tokens, n, qk_head_dim]
+                # q_pos_emb: [num_tokens, n, qk_pos_emb_head_dim]
+                q_no_pe, q_pos_emb = torch.split(
+                    q, [self.config.qk_head_dim, self.config.qk_pos_emb_head_dim], dim=-1
+                )
+
+                # RoPE and query (shared for wkv and latent)
+                # q_pos_emb: [num_tokens, n, qk_pos_emb_head_dim]
+                q_pos_emb = apply_rotary_pos_emb(
+                    q_pos_emb,
+                    rotary_pos_emb,
+                    config=self.config,
+                    cu_seqlens=cu_seqlens_q,
+                    mscale=mscale,
+                    cp_group=self.pg_collection.cp,
+                    mla_rotary_interleaved=True,
+                    mla_output_remove_interleaving=True,
+                )
+                # query: [num_tokens, n, (qk_head_dim + v_head_dim)]
+                query = torch.cat([q_no_pe, q_pos_emb], dim=-1)
+
+                pos_dim = self.config.qk_pos_emb_head_dim
+                kv_no_pe, k_pos_emb = torch.split(kv, [kv.size(-1) - pos_dim, pos_dim], dim=-1)
+
+                # k_pos_emb:[num_tokens, 1, qk_pos_emb_head_dim]
+                k_pos_emb = apply_rotary_pos_emb(
+                    k_pos_emb,
+                    rotary_pos_emb,
+                    config=self.config,
+                    cu_seqlens=cu_seqlens_kv,
+                    mscale=mscale,
+                    cp_group=self.pg_collection.cp,
+                    mla_rotary_interleaved=True,
+                    mla_output_remove_interleaving=True,
+                )
+
+                # Single head: key = value = [num_tokens, 1, v_head_dim]
+                kv = torch.cat([kv_no_pe, k_pos_emb], dim=-1).unsqueeze(-2)
+                key = kv
+                value = kv
+
+            query = query.contiguous()
+            key = key.contiguous()
+            value = value.contiguous()
+
+            return query, key, value
+
+        if self.recompute_up_proj:
+            quantization = self.config.fp8 or self.config.fp4
+            self.qkv_up_checkpoint = tensor_parallel.CheckpointWithoutOutput(fp8=quantization)
+            query, key, value = self.qkv_up_checkpoint.checkpoint(
+                qkv_up_proj_and_rope_apply, q_compressed, kv_compressed, k_pos_emb, rotary_pos_emb
+            )
+        else:
+            query, key, value = qkv_up_proj_and_rope_apply(
+                q_compressed, kv_compressed, k_pos_emb, rotary_pos_emb
+            )
+
+        return query, key, value, q_compressed, kv_compressed
+
+    def backward_dw(self) -> NoReturn:
+        """Execute weight gradient computation"""
+        self._backward_kv_proj()
+        self._backward_q_proj()
+        self._backward_output_proj()
+
+    def _backward_kv_proj(self):
+        """Computes weight gradients of KV projection layers"""
+        self.linear_kv_proj.backward_dw()
+
+    def _backward_q_proj(self):
+        """Computes weight gradients of Q projection layers"""
+        self.linear_q_down_proj.backward_dw()
+        self.linear_q_up_proj.backward_dw()
+
+    def _backward_output_proj(self):
+        """Computes weight gradients of output projection layer"""
+        self.linear_proj.backward_dw()
+
+    def set_for_recompute_input_layernorm(self):
+        """Set the attention layer for recompute input_layernorm. Only needed for fp8/fp4."""
+        set_save_original_input(self.linear_q_down_proj)
+        set_save_original_input(self.linear_kv_proj)
diff --git a/megatron/core/transformer/experimental_attention_variant/dsa.py b/megatron/core/transformer/experimental_attention_variant/dsa.py
index 3734db7043f..c503abfecd7 100644
--- a/megatron/core/transformer/experimental_attention_variant/dsa.py
+++ b/megatron/core/transformer/experimental_attention_variant/dsa.py
@@ -26,6 +26,21 @@
 except ImportError:
     hadamard_transform = None
 
+def _pytorch_hadamard_transform(x, scale=1.0):
+    n = x.shape[-1]
+    result = x.clone()
+    h = 1
+    while h < n:
+        result = result.view(*result.shape[:-1], -1, 2 * h)
+        a = result[..., :h].clone()
+        b = result[..., h:].clone()
+        result[..., :h] = a + b
+        result[..., h:] = a - b
+        result = result.view(*result.shape[:-2], -1)
+        h *= 2
+    return result * scale
+
+
 
 def rotate_activation(x: torch.Tensor) -> torch.Tensor:
     """Apply Hadamard rotation activation.
@@ -41,9 +56,10 @@ def rotate_activation(x: torch.Tensor) -> torch.Tensor:
     assert (
         x.dtype == torch.bfloat16
     ), f"rotate_activation only support bf16 input, but got {x.dtype}"
-    assert hadamard_transform is not None, "fast_hadamard_transform is not installed."
     hidden_size = x.size(-1)
-    return hadamard_transform(x, scale=hidden_size**-0.5)
+    if hadamard_transform is not None:
+        return hadamard_transform(x, scale=hidden_size**-0.5)
+    return _pytorch_hadamard_transform(x, scale=hidden_size**-0.5)
 
 
 class DSAIndexerLossLoggingHelper:
@@ -167,6 +183,7 @@ def compute_dsa_indexer_loss(
     loss_coeff: float,
     sparse_loss: bool,
     pg_collection: ProcessGroupCollection,
+    causal_mask_override: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     """
     Compute KL divergence loss between index_scores and true attention_scores.
@@ -203,29 +220,62 @@ def compute_dsa_indexer_loss(
     # Reshape to [b, np, sq, sk]
     attention_scores = attention_scores.reshape(b, np, sq, sk)
 
-    # causal_mask [sq, sk]
-    causal_mask = torch.triu(
-        torch.full((sq, sk), float('-inf'), dtype=torch.float32, device=attention_scores.device),
-        diagonal=1,
-    )
+    # causal_mask: use caller-provided mask when available (handles compressed KV),
+    # otherwise fall back to standard upper-triangular causal mask.
+    if causal_mask_override is not None:
+        causal_mask = causal_mask_override.to(dtype=torch.float32)  # [b, sq, sk]
+    else:
+        causal_mask = torch.triu(
+            torch.full(
+                (sq, sk), float('-inf'), dtype=torch.float32, device=attention_scores.device
+            ),
+            diagonal=1,
+        )
     # index_mask [b, sq, sk]
     index_mask = torch.full(
         (b, sq, sk), float("-inf"), dtype=torch.float32, device=causal_mask.device
     ).scatter_(-1, topk_indices, 0)
 
-    # [b, np, sq, skv] + [1, 1, sq, skv] -> [b, np, sq, skv]
-    attention_scores += causal_mask.view(1, 1, sq, sk)
+    # Apply causal mask to attention_scores
+    # causal_mask: [b, sq, sk] (from causal_mask_override) or [sq, sk] (from triu)
+    if causal_mask.dim() == 3:
+        attention_scores = attention_scores + causal_mask.unsqueeze(1)  # [b,1,sq,sk]
+    else:
+        attention_scores = attention_scores + causal_mask.view(1, 1, sq, sk)
     if sparse_loss:
         # [b, np, sq, sk] + [b, 1, sq, sk] -> [b, np, sq, sk]
         attention_scores += index_mask.view(b, 1, sq, sk)
         # [b, sq, sk] + [b, sq, sk] -> [b, sq, sk]
         index_scores += index_mask
 
+    # Identify rows where all KV positions are masked (e.g., early query positions with
+    # compress_ratio=4 have zero valid compressed KV entries). These rows would produce NaN
+    # from softmax(all -inf). We zero out their logits before softmax and mask out their
+    # contributions after, so NaN is never produced.
+    # row_valid: [b, sq] or [sq] — True if the row has at least one unmasked position.
+    row_valid = (causal_mask > float('-inf')).any(dim=-1)
+    if row_valid.dim() == 1:
+        # [sq] -> broadcast for attention_scores [b, np, sq, sk] and index_scores [b, sq, sk]
+        attn_row_mask = row_valid.view(1, 1, sq, 1)  # [1, 1, sq, 1]
+        idx_row_mask = row_valid.view(1, sq, 1)  # [1, sq, 1]
+    else:
+        # [b, sq]
+        attn_row_mask = row_valid.view(b, 1, sq, 1)  # [b, 1, sq, 1]
+        idx_row_mask = row_valid.view(b, sq, 1)  # [b, sq, 1]
+
+    # Zero out fully-masked rows before softmax so it produces valid uniform distribution
+    attention_scores = attention_scores.masked_fill(~attn_row_mask, 0.0)
+    index_scores = index_scores.masked_fill(~idx_row_mask, 0.0)
+
     # [b, np, sq, sk] -> [b, np, sq, sk]
     attention_scores = torch.nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32)
     # [b, sq, sk] -> [b, sq, sk]
     index_scores = torch.nn.functional.softmax(index_scores, dim=-1, dtype=torch.float32)
 
+    # Zero out invalid rows so they contribute nothing to loss/gradients
+    attention_scores = attention_scores * attn_row_mask.float()
+    index_scores = index_scores * idx_row_mask.float()
+
     # Sum attention scores across heads.
     # [batch, heads, seqlen_q, seqlen_k] -> [batch, seqlen_q, seqlen_k]
     attention_scores = attention_scores.sum(dim=1)
@@ -234,7 +284,9 @@ def compute_dsa_indexer_loss(
         torch.distributed.all_reduce(attention_scores.contiguous(), group=pg_collection.tp)
     # L1 normalize target on the last dimension. Doesn't use abs() because attention_scores are
     # obtained from softmax so they are already non-negative.
-    attention_scores = attention_scores / attention_scores.sum(dim=-1, keepdim=True)
+    attention_scores = attention_scores / (
+        attention_scores.sum(dim=-1, keepdim=True).clamp(min=1e-10)
+    )
 
     # Compute KL divergence: KL(target || index) = target(x) * log(target(x) / index(x))
     # kl_per_element [b, sq, sk]
@@ -310,7 +362,7 @@ def fused_qk_topk_naive(
     # [batch, seqlen, seqlen]
     index_scores = _compute_index_scores(q, weights, k)
     if mask is not None:
-        assert mask.dtype == index_scores.dtype, "Mask dtype must match index scores dtype"
+        mask = mask.to(index_scores.dtype)
         index_scores = index_scores + mask
 
     # =========================================
@@ -338,6 +390,7 @@ def fwd_fused_indexer_loss_naive(
         loss_coeff,
         sparse_loss,
         pg_collection,
+        causal_mask_override=mask,
     )
 
     return topk_indices, indexer_loss
@@ -355,6 +408,7 @@ def bwd_fused_indexer_loss_naive(
     sparse_loss,
     grad_loss,
     pg_collection,
+    causal_mask_override=None,
 ):
     """Naive implementation of backward pass for indexer loss."""
     index_scores = _compute_index_scores(q, weights, k)  # [B, Sq, Sk]
@@ -374,23 +428,30 @@ def bwd_fused_indexer_loss_naive(
     # Reshape to [b, np, sq, sk]
     attention_scores = attention_scores.reshape(b, np, sq, sk)
 
-    # causal_mask [sq, sk]
-    causal_mask = torch.triu(
-        torch.full((sq, sk), float('-inf'), dtype=torch.float32, device=attention_scores.device),
-        diagonal=1,
-    )
+    # causal_mask: use caller-provided mask when available (handles compressed KV),
+    # otherwise fall back to standard upper-triangular causal mask.
+    if causal_mask_override is not None:
+        causal_mask = causal_mask_override.to(dtype=torch.float32)  # [b, sq, sk]
+    else:
+        causal_mask = torch.triu(
+            torch.full(
+                (sq, sk), float('-inf'), dtype=torch.float32, device=attention_scores.device
+            ),
+            diagonal=1,
+        )
     # index_mask [b, sq, sk]
     index_mask = torch.full(
         (b, sq, sk), float("-inf"), dtype=torch.float32, device=causal_mask.device
     ).scatter_(-1, topk_indices, 0)
 
     # Apply causal mask to both attention and index scores
-    # [b, np, sq, skv] + [1, 1, sq, skv] -> [b, np, sq, skv]
-    attention_scores = attention_scores + causal_mask.view(1, 1, sq, sk)
-    # [b, sq, sk] + [1, sq, sk] -> [b, sq, sk]
-    index_scores = index_scores + causal_mask.unsqueeze(0)
-    # Free causal_mask - no longer needed
-    del causal_mask
+    # attention_scores: [b, np, sq, sk], causal_mask: [b, sq, sk] or [sq, sk]
+    if causal_mask.dim() == 3:
+        attention_scores = attention_scores + causal_mask.unsqueeze(1)  # [b,1,sq,sk]
+        index_scores = index_scores + causal_mask  # [b,sq,sk]
+    else:
+        attention_scores = attention_scores + causal_mask.view(1, 1, sq, sk)
+        index_scores = index_scores + causal_mask.unsqueeze(0)
 
     if sparse_loss:
         # [b, np, sq, sk] + [b, 1, sq, sk] -> [b, np, sq, sk]
@@ -398,7 +459,24 @@ def bwd_fused_indexer_loss_naive(
         # [b, sq, sk] + [b, sq, sk] -> [b, sq, sk]
         index_scores = index_scores + index_mask
 
-    # Compute softmax for both
+    # Identify rows where all KV positions are masked (e.g., early query positions with
+    # compress_ratio=4 have zero valid compressed KV entries). Zero out their logits before
+    # softmax and mask out contributions after, so NaN is never produced.
+    row_valid = (causal_mask > float('-inf')).any(dim=-1)
+    # Free causal_mask - no longer needed
+    del causal_mask
+    if row_valid.dim() == 1:
+        attn_row_mask = row_valid.view(1, 1, sq, 1)
+        idx_row_mask = row_valid.view(1, sq, 1)
+    else:
+        attn_row_mask = row_valid.view(b, 1, sq, 1)
+        idx_row_mask = row_valid.view(b, sq, 1)
+
+    # Zero out fully-masked rows before softmax
+    attention_scores = attention_scores.masked_fill(~attn_row_mask, 0.0)
+    index_scores = index_scores.masked_fill(~idx_row_mask, 0.0)
+
+    # Compute softmax
     attention_scores_softmax = torch.nn.functional.softmax(
         attention_scores, dim=-1, dtype=torch.float32
     )
@@ -409,6 +487,10 @@ def bwd_fused_indexer_loss_naive(
     # Free index_scores - no longer needed after softmax
     del index_scores
 
+    # Zero out invalid rows so they contribute nothing to gradients
+    attention_scores_softmax = attention_scores_softmax * attn_row_mask.float()
+    index_scores_softmax = index_scores_softmax * idx_row_mask.float()
+
     # Sum attention scores across heads: [b, np, sq, sk] -> [b, sq, sk]
     attention_scores_sum = attention_scores_softmax.sum(dim=1)
     # Free attention_scores_softmax
@@ -421,7 +503,7 @@ def bwd_fused_indexer_loss_naive(
     # L1 normalize
     attention_scores_normalized = attention_scores_sum / attention_scores_sum.sum(
         dim=-1, keepdim=True
-    )
+    ).clamp(min=1e-10)
     # Free attention_scores_sum - no longer needed after normalization
     del attention_scores_sum
 
@@ -452,19 +534,31 @@ def bwd_fused_indexer_loss_naive(
 
     # Zero out gradients for masked positions
     # Create a mask for valid (non-masked) positions
-    # Causal mask: position (i, j) is valid if j <= i
-    causal_valid_mask = torch.tril(
-        torch.ones((sq, sk), device=q.device, dtype=torch.bool)
-    )  # [sq, sk]
+    if causal_mask_override is not None:
+        # Derive valid mask from the causal_mask_override: valid where mask == 0
+        _cm = causal_mask_override.to(dtype=torch.float32)
+        if _cm.dim() == 2:
+            _cm = _cm.unsqueeze(0)  # [1, sq, sk]
+        causal_valid_mask = (_cm == 0).squeeze(0) if _cm.shape[0] == 1 else (_cm == 0)
+    else:
+        # Standard causal: position (i, j) is valid if j <= i
+        causal_valid_mask = torch.tril(
+            torch.ones((sq, sk), device=q.device, dtype=torch.bool)
+        )  # [sq, sk]
+
+    if causal_valid_mask.dim() == 2:
+        causal_valid_mask = causal_valid_mask.unsqueeze(0)
+    causal_valid_mask = causal_valid_mask.expand(b, sq, sk)
+
     if sparse_loss:
         # Also apply index mask - only topk positions are valid
         index_valid_mask = index_mask == 0  # [b, sq, sk]
         del index_mask  # Free index_mask immediately after use
-        valid_mask = causal_valid_mask.unsqueeze(0) & index_valid_mask  # [b, sq, sk]
+        valid_mask = causal_valid_mask & index_valid_mask  # [b, sq, sk]
         del index_valid_mask
     else:
         del index_mask  # Free index_mask even if not used for sparse_loss
-        valid_mask = causal_valid_mask.unsqueeze(0).expand(b, sq, sk)  # [b, sq, sk]
+        valid_mask = causal_valid_mask  # [b, sq, sk]
     del causal_valid_mask
 
     grad_index_scores_logits = grad_index_scores_logits * valid_mask.float()
@@ -543,7 +637,7 @@ def forward(
         )
 
         # Save for backward (recomputation strategy)
-        ctx.save_for_backward(q, weights, k, query, key, topk_indices)
+        ctx.save_for_backward(q, weights, k, query, key, topk_indices, mask)
         ctx.softmax_scale = softmax_scale
         ctx.loss_coeff = loss_coeff
         ctx.sparse_loss = sparse_loss
@@ -556,7 +650,7 @@ def backward(ctx, grad_topk_indices, grad_loss):
         """
         Backward: Recompute what we need.
         """
-        q, weights, k, query, key, topk_indices = ctx.saved_tensors
+        q, weights, k, query, key, topk_indices, mask = ctx.saved_tensors
 
         grad_q, grad_weights, grad_k = bwd_fused_indexer_loss_naive(
             q,
@@ -570,6 +664,7 @@ def backward(ctx, grad_topk_indices, grad_loss):
             ctx.sparse_loss,
             grad_loss,
             ctx.pg_collection,
+            causal_mask_override=mask,
         )
 
         # query and key are detached in forward, so return None for their gradients
@@ -778,10 +873,12 @@ def __init__(
 
     def _apply_rope(self, x: torch.Tensor, rotary_pos_emb: torch.Tensor, mscale: float):
         """Apply RoPE to the input tensor."""
-        # x_nope [seqlen, batch, *, index_head_dim - qk_pos_emb_head_dim]
         # x_pe   [seqlen, batch, *, qk_pos_emb_head_dim]
-        x_nope, x_pe = torch.split(
-            x, [self.index_head_dim - self.qk_pos_emb_head_dim, self.qk_pos_emb_head_dim], dim=-1
+        # x_nope [seqlen, batch, *, index_head_dim - qk_pos_emb_head_dim]
+        # To align with DeepSeek's implementation,
+        # x_pe is placed at the front, and x_nope is placed at the back.
+        x_pe, x_nope = torch.split(
+            x, [self.qk_pos_emb_head_dim, self.index_head_dim - self.qk_pos_emb_head_dim], dim=-1
         )
         x_pe = apply_rotary_pos_emb(
             x_pe,
@@ -790,9 +887,12 @@ def _apply_rope(self, x: torch.Tensor, rotary_pos_emb: torch.Tensor, mscale: flo
             cu_seqlens=None,
             mscale=mscale,
             cp_group=self.pg_collection.cp,
+            # This flag is for the MLA-style interleaving in RoPE.
+            # Set it to False, as indexer does not apply interleaved RoPE.
+            mla_rotary_interleaved=False,
         )
         # [seqlen, batch, *, index_head_dim]
-        x = torch.cat([x_nope, x_pe], dim=-1)
+        x = torch.cat([x_pe, x_nope], dim=-1)
         return x
 
     def forward_before_topk(
diff --git a/megatron/core/transformer/hyper_connection.py b/megatron/core/transformer/hyper_connection.py
new file mode 100644
index 00000000000..900a24ea4a6
--- /dev/null
+++ b/megatron/core/transformer/hyper_connection.py
@@ -0,0 +1,734 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import math
+from typing import TYPE_CHECKING, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import nvtx_decorator
+
+if TYPE_CHECKING:
+    from megatron.core.tensor_parallel.random import CheckpointManager
+
+
+@torch.compile
+def _sinkhorn_iterations(input_logits: Tensor, num_iterations: int, eps: float) -> Tensor:
+    row_max = input_logits.max(dim=-1, keepdim=True).values
+    M = torch.exp(input_logits - row_max)
+    for _ in range(num_iterations):
+        M = M / M.sum(dim=-1, keepdim=True).clamp(min=eps)
+        M = M / M.sum(dim=-2, keepdim=True).clamp(min=eps)
+    return M
+
+
+class SinkhornKnopp(torch.autograd.Function):
+    """Sinkhorn-Knopp projection to doubly stochastic matrix.
+
+    This is an autograd.Function because the iterative forward is re-executed
+    during backward (under torch.enable_grad) so that PyTorch's autograd can
+    differentiate through it without storing all intermediate iteration states.
+    """
+
+    @staticmethod
+    def forward(ctx, input_logits: Tensor, num_iterations: int, eps: float = 1e-6) -> Tensor:
+        """Run Sinkhorn iterations and save inputs for backward recomputation."""
+        M = _sinkhorn_iterations(input_logits, num_iterations, eps)
+        ctx.save_for_backward(input_logits)
+        ctx.num_iterations = num_iterations
+        ctx.eps = eps
+        return M
+
+    @staticmethod
+    def backward(ctx, grad_output: Tensor):
+        """Recompute forward under enable_grad and back-propagate."""
+        (input_logits,) = ctx.saved_tensors
+        with torch.enable_grad():
+            logits = input_logits.detach().requires_grad_(True)
+            M = _sinkhorn_iterations(logits, ctx.num_iterations, ctx.eps)
+            M.backward(grad_output)
+        return logits.grad, None, None
+
+
+def native_sinkhorn(input_logits: Tensor, num_iterations: int, eps: float = 1e-6) -> Tensor:
+    """Native Sinkhorn-Knopp (autograd.Function wrapper)."""
+    return SinkhornKnopp.apply(input_logits, num_iterations, eps)
+
+
+@torch.compile
+def native_h_aggregate(x: Tensor, h_pre: Tensor) -> Tensor:
+    """Native n-stream weighted aggregation: out = sum_j(h_pre_j * x_j)."""
+    return (x * h_pre.unsqueeze(-1)).sum(dim=2)
+
+
+@torch.compile
+def native_h_post_bda(
+    h_res: Tensor, original_residual: Tensor, h_post: Tensor, x: Tensor, bias: Optional[Tensor]
+) -> Tensor:
+    """Native H_res @ residual + H_post * (x [+ bias])."""
+    s, b, n, C = original_residual.shape
+    h_res_batched = h_res.view(s * b, n, n)
+    residual_batched = original_residual.view(s * b, n, C)
+    mixed = torch.bmm(h_res_batched, residual_batched).view(s, b, n, C)
+    x_expanded = h_post.unsqueeze(-1) * x.unsqueeze(2)
+    if bias is not None:
+        bias_expanded = h_post.unsqueeze(-1) * bias.view(1, 1, 1, C)
+        return x_expanded + bias_expanded + mixed
+    return x_expanded + mixed
+
+
+@torch.compile
+def native_proj_rms(x: Tensor, weight: Tensor, eps: float = 1e-6) -> Tuple[Tensor, Tensor]:
+    """Native fused projection + RMS normalization."""
+    proj = torch.matmul(x, weight.t())
+    norm = x.norm(dim=-1, keepdim=True)
+    K = x.shape[-1]
+    v = norm / math.sqrt(K) + eps
+    r = 1.0 / v
+    return proj, r
+
+
+@torch.compile
+def learned_output_contract(
+    hidden_states: Tensor, head_fn: Tensor, base: Tensor, scale: Tensor, n: int, eps: float
+) -> Tensor:
+    """Learned output contraction: n-stream → 1-stream via sigmoid-gated weighted sum."""
+    dtype = hidden_states.dtype
+    hidden_states = hidden_states.to(torch.float32)
+    head_fn = head_fn.to(torch.float32)
+    base = base.to(torch.float32)
+    scale = scale.to(torch.float32)
+    rsqrt = torch.rsqrt(hidden_states.square().mean(-1, keepdim=True) + eps)
+    mixes = F.linear(hidden_states, head_fn) * rsqrt
+    pre = torch.sigmoid(mixes * scale + base) + 1e-6
+    y = torch.sum(pre.unsqueeze(-1) * hidden_states.view(*hidden_states.shape[:-1], n, -1), dim=-2)
+    return y.to(dtype)
+
+
+# ============================================================================
+# HyperConnectionModule
+# ============================================================================
+
+
+# TODO: keep hyper connection in fp32 computation
+class HyperConnectionModule(MegatronModule):
+    """
+    Unified mHC (Manifold-Constrained Hyper-Connections) module.
+
+    Implements the complete mHC propagation:
+        x_{l+1} = H_res @ x_l + H_post^T @ F(H_pre @ x_l)
+
+    This module handles:
+    1. Computing learnable mappings: H_pre, H_post, H_res (with Sinkhorn-Knopp projection)
+    2. Aggregation: n-stream → 1-stream (H_pre @ x)
+    3. Expansion: 1-stream → n-stream (H_post^T @ output)
+    4. Residual merge: H_res @ x + expanded_output
+    5. Block-level expand/contract for TransformerBlock boundaries
+
+    Args:
+        config: TransformerConfig with hyper-connection fields
+        layer_number: Current layer index for initialization
+    """
+
+    def __init__(self, config: TransformerConfig, layer_number: int):
+        super().__init__(config)
+        self.config = config
+        self.layer_number = layer_number
+        self.n = config.num_residual_streams
+        self.hidden_size = config.hidden_size
+        self.sinkhorn_iterations = config.mhc_sinkhorn_iterations
+
+        # Projection weights for dynamic mappings
+        # Input: [s, b, n*C] -> Output: n^2 + 2n values per token
+        # - H_pre: n values
+        # - H_post: n values
+        # - H_res: n^2 values (before Sinkhorn projection)
+        self.mapping_proj = nn.Linear(
+            self.n * self.hidden_size, self.n * self.n + 2 * self.n, bias=False
+        )
+
+        init_alpha = config.mhc_init_gating_factor
+        # Learnable scaling factors (Eq. 5 in paper)
+        self.alpha_pre = nn.Parameter(torch.full((1,), init_alpha))
+        self.alpha_post = nn.Parameter(torch.full((1,), init_alpha))
+        self.alpha_res = nn.Parameter(torch.full((1,), init_alpha))
+
+        # Static bias terms
+        self.bias = nn.Parameter(torch.zeros(self.n * self.n + 2 * self.n))
+        self.norm_eps = 1e-6
+
+        # Choose implementation: fused cuTile kernels vs reference modules.
+        # Both paths expose the same call signatures so the rest of the code
+        # is implementation-agnostic.
+        if config.use_fused_mhc:
+            from megatron.core.fusions.fused_mhc_kernels import (
+                fused_h_aggregate,
+                fused_h_post_bda,
+                fused_proj_rms,
+                fused_sinkhorn,
+            )
+
+            self._sinkhorn_op = fused_sinkhorn
+            self._h_aggregate_op = fused_h_aggregate
+            self._h_post_bda_op = fused_h_post_bda
+            self._proj_rms_op = fused_proj_rms
+        else:
+            self._sinkhorn_op = native_sinkhorn
+            self._h_aggregate_op = native_h_aggregate
+            self._h_post_bda_op = native_h_post_bda
+            self._proj_rms_op = native_proj_rms
+
+        self._init_weights()
+
+    def _init_weights(self) -> None:
+        """Initialize weights for stable training."""
+        nn.init.xavier_uniform_(self.mapping_proj.weight)
+
+        # Set sequence_parallel attribute on parameters for gradient synchronization
+        # across TP ranks when sequence_parallel is enabled.
+        # This is required because HyperConnectionModule uses non-TP-aware layers
+        # (nn.Linear, nn.RMSNorm) whose gradients need to be all-reduced.
+        if self.config.sequence_parallel:
+            setattr(self.mapping_proj.weight, 'sequence_parallel', True)
+            setattr(self.alpha_pre, 'sequence_parallel', True)
+            setattr(self.alpha_post, 'sequence_parallel', True)
+            setattr(self.alpha_res, 'sequence_parallel', True)
+            setattr(self.bias, 'sequence_parallel', True)
+
+    def _projection_and_get_norm(self, x: Tensor) -> Tuple[Tensor, Tensor]:
+        """
+        Projection + RMS normalization.
+
+        Args:
+            x: [s, b, n*C] - n-stream hidden states
+        """
+        s, b, nC = x.shape
+        x_2d = x.reshape(s * b, nC)
+        proj, r = self._proj_rms_op(x_2d, self.mapping_proj.weight, self.norm_eps)
+        return proj.view(s, b, -1), r.view(s, b, 1)
+
+    @torch.compile
+    def _compute_h(self, proj: Tensor, r: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
+        """
+        Compute h from projected hidden states and scaling factors.
+
+        Args:
+            proj: [s, b, n^2 + 2n] - projected hidden states
+            r: [s, b, 1] - scaling factors
+
+        Returns:
+            h_pre: [s, b, n] - aggregation weights
+            h_post: [s, b, n] - expansion weights
+            h_res: [s, b, n^2] - residual mixing logits
+        """
+        alpha_ = torch.cat(
+            [
+                self.alpha_pre.expand(self.n),
+                self.alpha_post.expand(self.n),
+                self.alpha_res.expand(self.n * self.n),
+            ],
+            dim=-1,
+        )
+        h = r * proj * alpha_ + self.bias
+        # H_pre = σ(α_pre * (θ_pre @ x̃) + b_pre)
+        h_pre = h[..., : self.n].sigmoid()  # [s, b, n]
+
+        # H_post = 2σ(α_post * (θ_post @ x̃) + b_post)
+        h_post = h[..., self.n : 2 * self.n].sigmoid() * 2  # [s, b, n]
+        h_res = h[..., 2 * self.n :]
+        return h_pre, h_post, h_res
+
+    @nvtx_decorator(message="HyperConnection::compute_mappings")
+    def compute_mappings(self, x: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
+        """
+        Compute mHC mappings from input hidden states.
+
+        Reference: Eq. (5) and (8) in mHC paper
+
+        Args:
+            x: [s, b, n*C] - n-stream hidden states
+
+        Returns:
+            h_pre: [s, b, n] - aggregation weights (sigmoid activated)
+            h_post: [s, b, n] - expansion weights (2*sigmoid activated)
+            h_res: [s, b, n, n] - residual mixing matrix (doubly stochastic)
+        """
+        s, b, _ = x.shape
+        with torch.cuda.nvtx.range("HyperConnection::projection_and_get_norm"):
+            proj, r = self._projection_and_get_norm(x)
+        with torch.cuda.nvtx.range("HyperConnection::compute_h"):
+            h_pre, h_post, h_res = self._compute_h(proj, r)
+        h_res = self._sinkhorn_op(
+            h_res.view(s, b, self.n, self.n), self.sinkhorn_iterations, self.norm_eps
+        )  # [s, b, n, n]
+
+        return h_pre, h_post, h_res
+
+    @torch.compile
+    def _apply_h_post(self, x: Tensor, h_post: Tensor) -> Tensor:
+        """
+        Core implementation of H_post application to a single tensor.
+
+        Computes: H_post^T @ x
+
+        Args:
+            x: Input tensor, can be either:
+               - [s, b, C] - standard hidden states
+               - [C] - bias tensor (will be broadcast)
+            h_post: [s, b, n] - expansion weights
+
+        Returns:
+            output: [s, b, n*C] - expanded tensor
+        """
+        n = self.n
+        s, b, _ = h_post.shape
+
+        if x.dim() == 1:
+            # x is bias with shape [C], need to broadcast to [s, b, 1, C]
+            C = x.shape[0]
+            x_expanded = x.unsqueeze(0).unsqueeze(0).unsqueeze(0).expand(s, b, 1, C)
+        else:
+            # x is [s, b, C]
+            C = x.shape[-1]
+            x_expanded = x.unsqueeze(2)  # [s, b, 1, C]
+
+        # h_post^T @ x : [s, b, n, 1] * [s, b, 1, C] -> [s, b, n, C]
+        # Using broadcast multiply instead of einsum
+        result = h_post.unsqueeze(-1) * x_expanded
+        return result.view(s, b, n * C)
+
+    @nvtx_decorator(message="HyperConnection::apply_h_post")
+    def apply_h_post(
+        self,
+        x_with_bias: Tuple[Tensor, Optional[Tensor]],
+        h_post: Tensor,
+        manager: Optional['CheckpointManager'] = None,
+    ) -> Tuple[Tensor, Optional[Tensor]]:
+        """
+        Apply H_post to x and optionally bias, with optional checkpointing.
+
+        This is the unified entry point that handles both normal execution
+        and checkpoint-based execution for memory efficiency.
+
+        Args:
+            x_with_bias: Tuple of (x, bias) where:
+                - x: [s, b, C] - hidden states
+                - bias: [C] or None - optional bias tensor
+            h_post: [s, b, n] - expansion weights
+            manager: Optional CheckpointManager for checkpoint management.
+                When provided, wraps _apply_h_post with CheckpointWithoutOutput.
+
+        Returns:
+            Tuple of (x_out, bias_out) where:
+                - x_out: [s, b, n*C] - expanded hidden states
+                - bias_out: [s, b, n*C] or None - expanded bias if input bias was not None
+        """
+        x, bias = x_with_bias
+
+        if manager is not None:
+            from megatron.core.tensor_parallel.random import CheckpointWithoutOutput
+
+            # Checkpoint _apply_h_post to discard the output
+            x_out = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(
+                self._apply_h_post, x, h_post
+            )
+
+            # Checkpoint _apply_h_post for bias if not None
+            if bias is not None:
+                bias_out = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(
+                    self._apply_h_post, bias, h_post
+                )
+            else:
+                bias_out = None
+        else:
+            # Normal execution without checkpoint
+            x_out = self._apply_h_post(x, h_post)
+            bias_out = self._apply_h_post(bias, h_post) if bias is not None else None
+
+        return x_out, bias_out
+
+    def aggregate(self, x: Tensor, h_pre: Tensor) -> Tensor:
+        """
+        Aggregate n-stream to 1-stream.
+
+        Args:
+            x: [s, b, n*C] - n-stream hidden states
+            h_pre: [s, b, n] - aggregation weights
+
+        Returns:
+            aggregated: [s, b, C] - single stream hidden states
+        """
+        s, b, _ = x.shape
+        C = self.hidden_size
+        x_streams = x.view(s, b, self.n, C)
+        return self._h_aggregate_op(x_streams, h_pre)
+
+    @torch.compile
+    def apply_h_res(self, h_res: Tensor, residual: Tensor) -> Tensor:
+        """
+        Apply H_res to residual using H_res weights.
+
+        Computes: H_res @ residual
+
+        Args:
+            h_res: [s, b, n, n] - residual mixing matrix
+            residual: [s, b, n*C] - n-stream hidden states
+        """
+        s, b, _ = residual.shape
+        n = self.n
+        C = self.hidden_size
+
+        # Reshape for bmm: [s, b, n, n] -> [s*b, n, n]
+        h_res_batched = h_res.view(s * b, n, n)
+        # [s, b, n*C] -> [s, b, n, C] -> [s*b, n, C]
+        residual_batched = residual.view(s, b, n, C).view(s * b, n, C)
+
+        # Batch matrix multiply: [s*b, n, n] @ [s*b, n, C] -> [s*b, n, C]
+        mixed = torch.bmm(h_res_batched, residual_batched)
+
+        return mixed.view(s, b, n * C)
+
+    def forward(
+        self, hidden_states: Tensor, mhc_recompute_manager: Optional['CheckpointManager'] = None
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        """
+        Full mHC forward pass.
+
+        Args:
+            hidden_states: [s, b, n*C] - n-stream hidden states
+            mhc_recompute_manager: Optional CheckpointManager for checkpoint management.
+                When provided, uses _forward_with_checkpoint for memory-efficient execution.
+
+        Returns:
+            aggregated: [s, b, C] - aggregated input for layer computation
+            h_res: [s, b, n, n] - residual mixing matrix (for fused kernel)
+            h_post: [s, b, n] - expansion weights
+        """
+        if mhc_recompute_manager is not None:
+            return self._forward_with_checkpoint(hidden_states, mhc_recompute_manager)
+        else:
+            return self._forward_normal(hidden_states)
+
+    def _forward_normal(self, hidden_states: Tensor) -> Tuple[Tensor, Tensor, Tensor]:
+        """
+        Normal forward pass without checkpointing.
+
+        Args:
+            hidden_states: [s, b, n*C] - n-stream hidden states
+
+        Returns:
+            aggregated: [s, b, C] - aggregated input for layer computation
+            h_res: [s, b, n, n] - residual mixing matrix (for fused kernel)
+            h_post: [s, b, n] - expansion weights
+        """
+        # Compute mappings
+        h_pre, h_post, h_res = self.compute_mappings(hidden_states)
+
+        # Aggregate for layer input
+        with torch.cuda.nvtx.range("HyperConnection::aggregate"):
+            aggregated = self.aggregate(hidden_states, h_pre)
+
+        return aggregated, h_res, h_post
+
+    def _forward_with_checkpoint(
+        self, hidden_states: Tensor, manager: 'CheckpointManager'
+    ) -> Tuple[Tensor, Tensor, Tensor]:
+        """
+        Forward pass with checkpointing for memory efficiency.
+
+        compute_mappings is called directly (not checkpointed) since its outputs
+        (h_pre, h_post, h_res) are needed downstream. Only aggregate is wrapped with
+        CheckpointWithoutOutput and auto-registered to the manager.
+        apply_h_res is deferred to fused_h_res_h_post_bda for kernel fusion.
+
+        Args:
+            hidden_states: [s, b, n*C] - n-stream hidden states
+            manager: CheckpointManager for unified recomputation
+
+        Returns:
+            aggregated: [s, b, C] - aggregated input for layer computation
+            h_res: [s, b, n, n] - residual mixing matrix (for fused kernel)
+            h_post: [s, b, n] - expansion weights
+        """
+        from megatron.core.tensor_parallel.random import CheckpointWithoutOutput
+
+        h_pre, h_post, h_res = self.compute_mappings(hidden_states)
+
+        # Checkpoint aggregate - auto-registers to manager
+        aggregated = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(
+            self.aggregate, hidden_states, h_pre
+        )
+
+        return aggregated, h_res, h_post
+
+    # ==================== Block-level utilities ====================
+
+    @staticmethod
+    def input_expand(x: Tensor, n: int) -> Tensor:
+        """
+        Expand 1-stream to n-stream at TransformerBlock entry.
+
+        Simple replication strategy: each stream initialized as a copy of input.
+
+        Args:
+            x: [s, b, C] - single stream hidden states
+            n: Number of residual streams
+
+        Returns:
+            expanded: [s, b, n*C] - n-stream hidden states
+        """
+        s, b, C = x.shape
+        # Replicate input to n streams
+        expanded = x.unsqueeze(2).expand(s, b, n, C).contiguous()
+        return expanded.view(s, b, n * C)
+
+    @staticmethod
+    def output_contract(x: Tensor, n: int) -> Tensor:
+        """
+        Contract n-stream to 1-stream at TransformerBlock exit.
+
+        Simple averaging strategy: average all streams.
+
+        Args:
+            x: [s, b, n*C] - n-stream hidden states
+            n: Number of residual streams
+
+        Returns:
+            contracted: [s, b, C] - single stream hidden states
+        """
+        s, b, nC = x.shape
+        C = nC // n
+        # Average all streams
+        x_streams = x.view(s, b, n, C)
+        contracted = x_streams.mean(dim=2)
+        return contracted
+
+    # ==================== Fused kernel placeholder ====================
+
+    @nvtx_decorator(message="HyperConnection::fused_h_res_h_post_bda")
+    def fused_h_res_h_post_bda(
+        self,
+        h_res: Tensor,
+        original_residual: Tensor,
+        h_post: Tensor,
+        layer_output_with_bias: Tuple[Tensor, Optional[Tensor]],
+        dropout_prob: float,
+        training: bool,
+        fused: bool,
+        manager: Optional['CheckpointManager'] = None,
+    ) -> Tensor:
+        """
+        Fused kernel combining apply_h_res, apply_h_post and bias-dropout-add.
+
+        This is a placeholder for future kernel fusion optimization.
+        Currently implements the operations sequentially using native PyTorch.
+
+        The computation flow is:
+            1. mixed = H_res @ original_residual (apply_h_res)
+            2. expanded = H_post^T @ layer_output (apply_h_post)
+            3. output = dropout(expanded + bias) + mixed (bias-dropout-add)
+
+        Args:
+            h_res: [s, b, n, n] - residual mixing matrix
+            original_residual: [s, b, n*C] - n-stream hidden states (before H_res applied)
+            h_post: [s, b, n] - expansion weights
+            layer_output_with_bias: Tuple of (x, bias) where:
+                - x: [s, b, C] - layer output (attention or MLP output)
+                - bias: [C] or None - optional bias tensor
+            dropout_prob: Dropout probability
+            training: Whether in training mode
+            fused: Whether to use fused BDA implementation
+            manager: Optional CheckpointManager for checkpoint management.
+                When provided, each operation is wrapped with CheckpointWithoutOutput.
+
+        Returns:
+            output: [s, b, n*C] - final output after all operations
+        """
+        if manager is not None:
+            return self._fused_h_res_h_post_bda_with_checkpoint(
+                h_res,
+                original_residual,
+                h_post,
+                layer_output_with_bias,
+                dropout_prob,
+                training,
+                fused,
+                manager,
+            )
+        else:
+            return self._fused_h_res_h_post_bda_native(
+                h_res,
+                original_residual,
+                h_post,
+                layer_output_with_bias,
+                dropout_prob,
+                training,
+                fused,
+            )
+
+    def _fused_h_res_h_post_bda_native(
+        self,
+        h_res: Tensor,
+        original_residual: Tensor,
+        h_post: Tensor,
+        layer_output_with_bias: Tuple[Tensor, Optional[Tensor]],
+        dropout_prob: float,
+        training: bool,
+        fused: bool,
+    ) -> Tensor:
+        """
+        h_res, h_post and bda.
+
+        When dropout is zero (or inference), uses a single fused/reference kernel
+        for H_res @ residual + H_post * (x + bias). Falls back to unfused
+        implementation when dropout is needed.
+
+        Args:
+            h_res: [s, b, n, n] - residual mixing matrix
+            original_residual: [s, b, n*C] - n-stream hidden states
+            h_post: [s, b, n] - expansion weights
+            layer_output_with_bias: Tuple of (x, bias)
+            dropout_prob: Dropout probability
+            training: Whether in training mode
+            fused: Whether to use fused BDA implementation
+
+        Returns:
+            output: [s, b, n*C] - final output
+        """
+        x, bias = layer_output_with_bias
+
+        if dropout_prob == 0.0 or not training:
+            s, b, _ = original_residual.shape
+            n = self.n
+            C = self.hidden_size
+            orig_reshaped = original_residual.view(s, b, n, C)
+            output = self._h_post_bda_op(h_res, orig_reshaped, h_post, x, bias)
+            return output.view(s, b, n * C)
+
+        from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+
+        with torch.cuda.nvtx.range("HyperConnection::apply_h_res"):
+            mixed = self.apply_h_res(h_res, original_residual)
+        with torch.cuda.nvtx.range("HyperConnection::apply_h_post"):
+            x_expanded = self._apply_h_post(x, h_post)
+            bias_expanded = self._apply_h_post(bias, h_post) if bias is not None else None
+        bda_func = get_bias_dropout_add(training, fused)
+        with torch.cuda.nvtx.range("HyperConnection::bda"):
+            output = bda_func((x_expanded, bias_expanded), mixed, dropout_prob)
+        return output
+
+    @nvtx_decorator(message="HyperConnection::fused_h_res_h_post_bda_with_checkpoint")
+    def _fused_h_res_h_post_bda_with_checkpoint(
+        self,
+        h_res: Tensor,
+        original_residual: Tensor,
+        h_post: Tensor,
+        layer_output_with_bias: Tuple[Tensor, Optional[Tensor]],
+        dropout_prob: float,
+        training: bool,
+        fused: bool,
+        manager: 'CheckpointManager',
+    ) -> Tensor:
+        """
+        Checkpointed variant of _fused_h_res_h_post_bda_native.
+
+        Wraps compute in CheckpointWithoutOutput for activation memory savings.
+        Cannot reuse _native directly because checkpoint requires all args to be
+        positional Tensors; tuple/Optional/scalar args are unpacked or captured
+        via closure instead.
+
+        Args:
+            h_res: [s, b, n, n] - residual mixing matrix
+            original_residual: [s, b, n*C] - n-stream hidden states
+            h_post: [s, b, n] - expansion weights
+            layer_output_with_bias: Tuple of (x, bias)
+            dropout_prob: Dropout probability
+            training: Whether in training mode
+            fused: Whether to use fused BDA implementation
+            manager: CheckpointManager for checkpoint management
+
+        Returns:
+            output: [s, b, n*C] - final output
+        """
+        from megatron.core.tensor_parallel.random import CheckpointWithoutOutput
+
+        x, bias = layer_output_with_bias
+        n = self.n
+        C = self.hidden_size
+
+        # Fast path: no dropout — use fused/reference h_post_bda kernel (same as _native)
+        if dropout_prob == 0.0 or not training:
+
+            def _fused_wrapper(h_res, original_residual, h_post, x, *optional_bias):
+                s, b, _ = original_residual.shape
+                orig_reshaped = original_residual.view(s, b, n, C)
+                b_arg = optional_bias[0] if optional_bias else None
+                return self._h_post_bda_op(h_res, orig_reshaped, h_post, x, b_arg).view(s, b, n * C)
+
+            ckpt = CheckpointWithoutOutput(ckpt_manager=manager)
+            if bias is not None:
+                output = ckpt.checkpoint(_fused_wrapper, h_res, original_residual, h_post, x, bias)
+            else:
+                output = ckpt.checkpoint(_fused_wrapper, h_res, original_residual, h_post, x)
+
+        # Slow path: dropout required — fused kernel does not support dropout,
+        # fall back to sequential apply_h_res + apply_h_post + bda
+        else:
+            from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
+
+            bda_func = get_bias_dropout_add(training, fused)
+            has_bias = bias is not None
+
+            def _native_wrapper(h_res, original_residual, h_post, x, *optional_bias):
+                with torch.cuda.nvtx.range("HyperConnection::apply_h_res"):
+                    mixed = self.apply_h_res(h_res, original_residual)
+                with torch.cuda.nvtx.range("HyperConnection::apply_h_post"):
+                    x_expanded = self._apply_h_post(x, h_post)
+                    if has_bias:
+                        bias_expanded = self._apply_h_post(optional_bias[0], h_post)
+                    else:
+                        bias_expanded = None
+                with torch.cuda.nvtx.range("HyperConnection::bda"):
+                    output = bda_func((x_expanded, bias_expanded), mixed, dropout_prob)
+                return output
+
+            ckpt = CheckpointWithoutOutput(ckpt_manager=manager)
+            if has_bias:
+                output = ckpt.checkpoint(_native_wrapper, h_res, original_residual, h_post, x, bias)
+            else:
+                output = ckpt.checkpoint(_native_wrapper, h_res, original_residual, h_post, x)
+
+        return output
+
+
+# ==================== Checkpoint utilities for mHC ====================
+
+
+class HyperConnectionCheckpoint:
+    """
+    Checkpoint utility for mHC intermediate activations.
+
+    Implements the paper's "recomputing strategy" to reduce memory footprint
+    by discarding intermediate n-stream activations and recomputing on-the-fly.
+    """
+
+    @staticmethod
+    def compute_optimal_block_size(num_layers: int, num_streams: int) -> int:
+        """
+        Compute optimal recomputation block size.
+
+        From paper Eq. (20): L_r^* ≈ sqrt(nL/(n+2))
+
+        Args:
+            num_layers: Total number of transformer layers
+            num_streams: Number of residual streams (n)
+
+        Returns:
+            block_size: Optimal block size for checkpointing
+        """
+        block_size = int(math.sqrt(num_streams * num_layers / (num_streams + 2)))
+        return max(1, block_size)
diff --git a/megatron/core/transformer/linear_cross_entropy.py b/megatron/core/transformer/linear_cross_entropy.py
new file mode 100644
index 00000000000..e7afe326e1c
--- /dev/null
+++ b/megatron/core/transformer/linear_cross_entropy.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+from typing import Literal, Optional, Tuple, Union
+
+import torch
+
+from megatron.core import tensor_parallel
+from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy
+
+
+class LinearCrossEntropyModule(tensor_parallel.ColumnParallelLinear):
+    """
+    A module that combines a ColumnParallelLinear layer with fused
+    linear + cross-entropy loss computation over a tensor-parallel vocabulary.
+    """
+
+    def forward(
+        self,
+        input_: torch.Tensor,
+        weight: Optional[torch.Tensor] = None,
+        runtime_gather_output: Optional[bool] = None,
+        output_cross_entropy_loss: bool = False,
+        labels: Optional[torch.Tensor] = None,
+        reduction: Literal["none", "sum", "mean"] = "none",
+        ignore_index: int = -100,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]:
+        """Run either the plain ColumnParallelLinear or fused linear+cross-entropy."""
+        if output_cross_entropy_loss:
+            assert labels is not None, "labels cannot be None when outputting cross-entropy loss."
+            return self._compute_linear_and_cross_entropy_loss(
+                hidden=input_,
+                weight=weight if weight is not None else self.weight,
+                labels=labels,
+                reduction=reduction,
+                ignore_index=ignore_index,
+            )
+
+        # Fall back to standard ColumnParallelLinear forward.
+        # ColumnParallelLinear.forward returns (output, bias) or just output
+        # depending on configuration, so keep the return type as Tensor.
+        return super().forward(input_, weight, runtime_gather_output)
+
+    def _compute_linear_and_cross_entropy_loss(
+        self,
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: Optional[torch.Tensor] = None,
+        reduction: Literal["none", "sum", "mean"] = "none",
+        ignore_index: int = -100,
+    ) -> torch.Tensor:
+        """Compute fused linear + cross-entropy over tensor-parallel vocab."""
+        assert self.config.cross_entropy_loss_fusion, "Cross-entropy loss fusion must be enabled."
+        assert self.config.cross_entropy_fusion_impl == "linear", (
+            "Cross-entropy loss fusion implementation must be 'linear' to use "
+            "_compute_linear_and_cross_entropy_loss."
+        )
+        assert weight is not None, "weight cannot be None when using fused linear cross entropy."
+        assert labels is not None, "labels cannot be None when using fused linear cross entropy."
+
+        # [b s] => [s b]
+        labels = labels.transpose(0, 1).contiguous()
+        loss = linear_cross_entropy(
+            hidden,
+            weight,
+            labels,
+            sequence_parallel=self.sequence_parallel,
+            reduction=reduction,
+            ignore_index=ignore_index,
+            tp_group=self.tp_group,
+        )
+        # If reduction != "none" this will be a scalar; for "none" it should
+        # match [s, b] and can be reshaped back to [b, s].
+        if reduction == "none":
+            loss = loss.view_as(labels).transpose(0, 1).contiguous()
+
+        return loss
diff --git a/megatron/core/transformer/mlp.py b/megatron/core/transformer/mlp.py
index 8a19fef87ec..99d8fd97dd9 100644
--- a/megatron/core/transformer/mlp.py
+++ b/megatron/core/transformer/mlp.py
@@ -27,6 +27,7 @@
 from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl, weighted_bias_swiglu_impl
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.transformer.utils import sharded_state_dict_default
 from megatron.core.typed_torch import apply_module, not_none
 from megatron.core.utils import (
     get_tensor_model_parallel_group_if_none,
@@ -352,7 +353,9 @@ def sharded_state_dict(
         sharded_state_dict = {}
         singleton_local_shards = (metadata or {}).get('singleton_local_shards', False)
         for name, module in self._modules.items():
-            sub_sd = module.sharded_state_dict(f"{prefix}{name}.", sharded_offsets, metadata)
+            sub_sd = sharded_state_dict_default(
+                module, f"{prefix}{name}.", sharded_offsets, metadata
+            )
             if self.config.gated_linear_unit and name == "linear_fc1":
                 for k, v in sub_sd.items():
                     if k in (f"{prefix}{name}.weight", f"{prefix}{name}.bias"):
diff --git a/megatron/core/transformer/module.py b/megatron/core/transformer/module.py
index 6539ee36105..2d588262676 100644
--- a/megatron/core/transformer/module.py
+++ b/megatron/core/transformer/module.py
@@ -322,6 +322,15 @@ def _get_te_cuda_graph_replay_args(self, *args, **kwargs):
 
         cudagraph_kwargs = kwargs.copy()
         cudagraph_kwargs['is_first_microbatch'] = getattr(self, 'current_microbatch', 0) == 0
+        if self.config.fine_grained_activation_offloading and getattr(
+            self, 'offload_module_in_cuda_graph', False
+        ):
+            from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+                FineGrainedActivationOffloadingInterface as off_interface,
+            )
+
+            cudagraph_kwargs['cuda_graph_stream'] = off_interface.cuda_graph_stream()
+            cudagraph_kwargs['cuda_graph_event'] = off_interface.cuda_graph_event()
         return cudagraph_args, cudagraph_kwargs
 
     def _should_call_local_cudagraph(self, *args, **kwargs):
diff --git a/megatron/core/transformer/moe/experts.py b/megatron/core/transformer/moe/experts.py
index 34e9fb17a02..384a26c0deb 100644
--- a/megatron/core/transformer/moe/experts.py
+++ b/megatron/core/transformer/moe/experts.py
@@ -3,8 +3,11 @@
 
 import logging
 from collections.abc import Callable
+from contextlib import nullcontext
 from copy import deepcopy
 from dataclasses import dataclass
+from functools import partial
+from itertools import chain
 from math import ceil
 from typing import Optional, Protocol, Tuple
 
@@ -23,6 +26,12 @@
 from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
     FineGrainedActivationOffloadingInterface as off_interface,
 )
+from megatron.core.tensor_parallel.layers import (
+    _initialize_affine_weight_cpu,
+    _initialize_affine_weight_gpu,
+    set_tensor_model_parallel_attributes,
+)
+from megatron.core.tensor_parallel.utils import divide
 from megatron.core.transformer.mlp import (
     MLP,
     MLPSubmodules,
@@ -33,6 +42,12 @@
 from megatron.core.transformer.moe.moe_utils import (
     ProcessGroupCollection,
     get_align_size_for_quantization,
+    skip_routed_expert_padding,
+)
+from megatron.core.transformer.moe.paged_stash import (
+    get_paged_stash_context,
+    paged_stash_group_commit,
+    paged_stash_group_start,
 )
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.utils import (
@@ -40,10 +55,14 @@
     sharded_state_dict_default,
 )
 from megatron.core.typed_torch import apply_module, not_none
+from megatron.core.utils import is_te_min_version
 
 if HAVE_TE:
+    import transformer_engine as te
+
     from megatron.core.extensions.transformer_engine import Fp8Padding, Fp8Unpadding
 else:
+    te = None  # type: ignore[assignment, misc]
     Fp8Padding, Fp8Unpadding = None, None
 
 try:
@@ -64,6 +83,469 @@
 logger = logging.getLogger(__name__)
 
 
+class GroupedMLP(MegatronModule):
+    """An efficient implementation of the Experts layer using GroupedGEMM.
+
+    Executes multiple experts in parallel to maximize computational efficiency.
+    """
+
+    # TODO(M4): breaking api, switched from pass in tp_group to pass in pg_collection.
+    def __init__(
+        self,
+        num_local_experts: int,
+        config: TransformerConfig,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+    ):
+        super().__init__(config=config)
+        self.config: TransformerConfig = config
+        self.num_local_experts = num_local_experts
+        gg.assert_grouped_gemm_is_available()
+        assert (
+            config.add_bias_linear == False
+        ), "bias not supported in Grouped GEMM yet, please set '--disable-bias-linear' instead."
+        assert (
+            config.moe_latent_size is None
+        ), "MoE latent projection not supported in GroupedMLP yet."
+
+        self.expert_parallel = config.expert_model_parallel_size > 1
+        if self.config.gated_linear_unit:
+            if self.config.activation_func not in (F.silu, F.gelu):
+                raise ValueError("Activation function must be silu or gelu when using GroupedMLP.")
+
+            @jit_fuser
+            def glu(x):
+                x = torch.chunk(x, 2, dim=-1)
+                return self.config.activation_func(x[0]) * x[1]
+
+            self.activation_func = glu
+        else:
+            self.activation_func = self.config.activation_func
+        self.activation_recompute = (
+            self.config.recompute_granularity == 'selective'
+            and "moe_act" in self.config.recompute_modules
+        )
+        if self.activation_recompute and (self.config.fp8 or self.config.fp4):
+            raise ValueError(
+                "moe_act recompute for fp8 or fp4 cannot work with the legacy GroupedMLP."
+            )
+
+        @jit_fuser
+        def activation_func_with_probs(x, probs):
+            dtype = x.dtype
+            res = self.activation_func(x) * probs
+            return res.to(dtype)
+
+        self.activation_func_with_probs = activation_func_with_probs
+
+        self.ep_group = pg_collection.ep
+        # use pg_collection.expt_tp_group as tensor parallel group in this module.
+        self.tp_group = pg_collection.expt_tp
+        # use pg_collection.expt_dp_group as data parallel group in this module.
+        self.dp_group = pg_collection.expt_dp
+        # How many feature each rank holds for fc1 and fc2, respectively.
+        tp_size = self.tp_group.size()
+        tp_rank = self.tp_group.rank()
+
+        fc1_output_size = self.config.moe_ffn_hidden_size * self.num_local_experts
+        if config.gated_linear_unit:
+            # Project to 4h. If using swiglu double the output width,
+            # see https://arxiv.org/pdf/2002.05202.pdf
+            fc1_output_size *= 2
+        fc1_output_size_per_partition = divide(fc1_output_size, tp_size)
+
+        fc2_input_size = self.config.moe_ffn_hidden_size * self.num_local_experts
+        fc2_input_size_per_partition = divide(fc2_input_size, tp_size)
+
+        # Note: The current kernel implementations of grouped_gemm
+        # does not support transposition with CUTLASS grouped GEMM
+        # (https://github.com/fanshiqing/grouped_gemm/blob/main/csrc/grouped_gemm.cu#L355-L358)
+        # and as a result we avoid allocate the transpose of weights.
+        # Initialize weight.
+        if config.use_cpu_initialization:
+            self.weight1 = Parameter(
+                torch.empty(
+                    self.config.hidden_size,
+                    fc1_output_size_per_partition,
+                    dtype=config.params_dtype,
+                )
+            )
+            self.weight2 = Parameter(
+                torch.empty(
+                    fc2_input_size_per_partition, self.config.hidden_size, dtype=config.params_dtype
+                )
+            )
+            if config.perform_initialization:
+                _initialize_affine_weight_cpu(
+                    self.weight1,
+                    self.config.hidden_size,
+                    fc1_output_size,
+                    fc1_output_size_per_partition,
+                    partition_dim=1,
+                    init_method=config.init_method,
+                    params_dtype=config.params_dtype,
+                    rank=tp_rank,
+                    world_size=tp_size,
+                )
+                _initialize_affine_weight_cpu(
+                    self.weight2,
+                    fc2_input_size,
+                    self.config.hidden_size,
+                    fc2_input_size_per_partition,
+                    partition_dim=0,
+                    init_method=config.output_layer_init_method,
+                    params_dtype=config.params_dtype,
+                    rank=tp_rank,
+                    world_size=tp_size,
+                )
+            else:
+                # Ensure TP attrs are set even when not initializing
+                set_tensor_model_parallel_attributes(
+                    tensor=self.weight1, is_parallel=True, dim=1, stride=1
+                )
+                set_tensor_model_parallel_attributes(
+                    tensor=self.weight2, is_parallel=True, dim=0, stride=1
+                )
+        else:
+            self.weight1 = Parameter(
+                torch.empty(
+                    self.config.hidden_size,
+                    fc1_output_size_per_partition,
+                    device=torch.cuda.current_device(),
+                    dtype=config.params_dtype,
+                )
+            )
+            self.weight2 = Parameter(
+                torch.empty(
+                    fc2_input_size_per_partition,
+                    self.config.hidden_size,
+                    device=torch.cuda.current_device(),
+                    dtype=config.params_dtype,
+                )
+            )
+            if config.perform_initialization:
+                _initialize_affine_weight_gpu(
+                    self.weight1, config.init_method, partition_dim=1, is_expert=True
+                )
+                _initialize_affine_weight_gpu(
+                    self.weight2, config.output_layer_init_method, partition_dim=0, is_expert=True
+                )
+            else:
+                # Ensure TP attrs are set even when not initializing
+                set_tensor_model_parallel_attributes(
+                    tensor=self.weight1, is_parallel=True, dim=1, stride=1
+                )
+                set_tensor_model_parallel_attributes(
+                    tensor=self.weight2, is_parallel=True, dim=0, stride=1
+                )
+        setattr(self.weight1, 'allreduce', not self.expert_parallel)
+        setattr(self.weight2, 'allreduce', not self.expert_parallel)
+
+        def remove_extra_states_check(self, incompatible_keys):
+            """
+            Remove _extra_state from unexpected keys.
+            These keys are for dist ckpt compatibility with SequentialMLP.
+            """
+            keys = deepcopy(incompatible_keys.unexpected_keys)
+            for key in keys:
+                if '_extra_state' in key:
+                    incompatible_keys.unexpected_keys.remove(key)
+
+        self.register_load_state_dict_post_hook(remove_extra_states_check)
+
+    def forward(
+        self,
+        permuted_local_hidden_states: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        permuted_probs: torch.Tensor,
+    ):
+        """Forward step of the GroupedMLP."""
+        assert self.config.bf16, "Currently GroupedMLP for MoE only supports bf16."
+        if self.activation_recompute:
+            self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput()
+
+        if self.config.moe_apply_probs_on_input:
+            assert (
+                self.config.moe_router_topk == 1
+            ), "`moe_apply_probs_on_input` only works with `moe_router_topk`=1."
+            original_dtype = permuted_local_hidden_states.dtype
+            permuted_local_hidden_states = (
+                permuted_probs.unsqueeze(-1) * permuted_local_hidden_states
+            )
+            permuted_local_hidden_states = permuted_local_hidden_states.to(original_dtype)
+            # Probs already applied, so reset to 1.
+            permuted_probs = torch.ones_like(permuted_probs)
+
+        if permuted_local_hidden_states.nelement() != 0:
+            # Reshape the weights for the grouped GEMMs.
+            w1 = self.weight1.view(self.num_local_experts, self.config.hidden_size, -1)
+            w2 = self.weight2.view(self.num_local_experts, -1, self.config.hidden_size)
+
+            fc1_output = gg.ops.gmm(
+                permuted_local_hidden_states, w1, tokens_per_expert, trans_b=False
+            )
+            if self.activation_recompute:
+                intermediate_parallel = self.activation_checkpoint.checkpoint(
+                    self.activation_func_with_probs, fc1_output, permuted_probs.unsqueeze(-1)
+                )
+                fc2_output = gg.ops.gmm(intermediate_parallel, w2, tokens_per_expert, trans_b=False)
+                self.activation_checkpoint.discard_output_and_register_recompute(fc2_output)
+            else:
+                intermediate_parallel = self.activation_func_with_probs(
+                    fc1_output, permuted_probs.unsqueeze(-1)
+                )
+                fc2_output = gg.ops.gmm(intermediate_parallel, w2, tokens_per_expert, trans_b=False)
+        else:
+            # No token is allocated for local experts.
+            assert torch.count_nonzero(tokens_per_expert) == 0
+
+            # Make sure params of experts still have gradients even given zero tokens.
+            w1 = self.weight1.view(self.config.hidden_size, -1)
+            w2 = self.weight2.view(-1, self.config.hidden_size)
+            h = torch.matmul(permuted_local_hidden_states, w1)
+            if self.activation_recompute:
+                h = self.activation_checkpoint.checkpoint(
+                    self.activation_func_with_probs, h, permuted_probs.unsqueeze(-1)
+                )
+                fc2_output = torch.matmul(h, w2)
+                self.activation_checkpoint.discard_output_and_register_recompute(fc2_output)
+            else:
+                h = self.activation_func_with_probs(h, permuted_probs.unsqueeze(-1))
+                fc2_output = torch.matmul(h, w2)
+
+        return fc2_output, None
+
+    def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata=None):
+        """
+        Maps local expert to global experts.
+        The sharded_state_dict for the weight parts are compatible with the SequentialMLP,
+        whereas the optimizer states are not due to the limitation from weight transposing.
+        That is, for finetuning scenario, the checkpoint is compatible with the SequentialMLP.
+
+        When `singleton_local_shards` metadata flag is True, experts are broken down into
+        separate tensors and stored under separate global keys. Additionally, similarly to MLP,
+        layers with GLU activations are broken down into separate `w` and `v` tensors.
+        """
+        singleton_local_shards = (metadata or {}).get('singleton_local_shards', False)
+        sharded_state_dict = {}
+        ep_size = self.ep_group.size()
+        ep_rank = self.ep_group.rank()
+        tp_size = self.tp_group.size()
+        tp_rank = self.tp_group.rank()
+        dp_rank = self.dp_group.rank()
+        num_global_experts = ep_size * self.num_local_experts
+        local_expert_indices_offset = ep_rank * self.num_local_experts
+
+        prepend_axis_num = len(sharded_offsets)
+        replica_id = (0, 0, dp_rank)
+
+        local_ffn_dim_size = (
+            self.weight2.numel() // self.num_local_experts // self.config.hidden_size
+        )
+
+        def _break_into_individual_experts(
+            experts_ten: torch.Tensor,
+            key: str,
+            tp_offset: Tuple[int, int, int],
+            replica_id: ReplicaId,
+        ):
+            """Breaks experts into individual tensors and stores them under separate global keys"""
+            experts_state = []
+            assert len(experts_ten) == self.num_local_experts, (
+                experts_ten.shape,
+                self.num_local_experts,
+            )
+            for local_expert_idx, expert_ten in enumerate(experts_ten):
+                global_expert_idx = local_expert_indices_offset + local_expert_idx
+                expert_key = key.replace(
+                    f'{prefix}experts.', f'{prefix}experts.{global_expert_idx}.'
+                )
+                experts_state.append(
+                    ShardedTensor.from_rank_offsets(
+                        expert_key,
+                        expert_ten.contiguous(),
+                        *sharded_offsets,
+                        tp_offset,
+                        replica_id=replica_id,
+                        prepend_axis_num=prepend_axis_num,
+                    )
+                )
+            return experts_state
+
+        @torch.no_grad()
+        def sh_ten_build_fn(
+            key: str,
+            t: torch.Tensor,
+            replica_id: ReplicaId,
+            flattened_range: Optional[slice],
+            tp_axis: int,
+            with_glu: bool,
+        ):
+            # TODO: write a generic implementation to cover both cases with and without GLU
+            if tp_axis == 1:
+                # weight1
+                if with_glu:
+                    last_dim_size = local_ffn_dim_size * 2
+                else:
+                    last_dim_size = local_ffn_dim_size
+                real_shape = (self.num_local_experts, self.config.hidden_size, last_dim_size)
+            elif tp_axis == 0:
+                # weight2
+                real_shape = (self.num_local_experts, local_ffn_dim_size, self.config.hidden_size)
+                assert with_glu == False
+            else:
+                raise ValueError("tp_axis should be 0 or 1.")
+            if flattened_range is None:
+                # weights
+                t = t.view(real_shape).transpose(-1, -2)
+                # change tp_axis due to the transposing
+                tp_axis = 1 - tp_axis
+                if with_glu:
+                    assert tp_axis == 0, tp_axis
+                    if singleton_local_shards:
+                        w_tensor, v_tensor = torch.chunk(t, 2, -2)
+                        w_key = f'{key}_w'
+                        v_key = f'{key}_v'
+                        sub_states = {
+                            'singleton_local_shards': LocalNonpersistentObject(True),
+                            'data': {
+                                'w': _break_into_individual_experts(
+                                    w_tensor,
+                                    w_key,
+                                    (prepend_axis_num, tp_rank, tp_size),
+                                    replica_id,
+                                ),
+                                'v': _break_into_individual_experts(
+                                    v_tensor,
+                                    v_key,
+                                    (prepend_axis_num, tp_rank, tp_size),
+                                    replica_id,
+                                ),
+                            },
+                        }
+                    else:
+                        local_tensors = torch.chunk(t, 2, -2)
+                        sub_states = [
+                            ShardedTensor.from_rank_offsets(
+                                key,
+                                local_tensors[0].contiguous(),
+                                *sharded_offsets,
+                                (prepend_axis_num, ep_rank, ep_size),
+                                (prepend_axis_num + 1, tp_rank, tp_size * 2),
+                                replica_id=replica_id,
+                                prepend_axis_num=prepend_axis_num,
+                            ),
+                            ShardedTensor.from_rank_offsets(
+                                key,
+                                local_tensors[1].contiguous(),
+                                *sharded_offsets,
+                                (prepend_axis_num, ep_rank, ep_size),
+                                (prepend_axis_num + 1, tp_size + tp_rank, tp_size * 2),
+                                replica_id=replica_id,
+                                prepend_axis_num=prepend_axis_num,
+                            ),
+                        ]
+                else:
+                    if singleton_local_shards:
+                        sub_states = {
+                            'singleton_local_shards': LocalNonpersistentObject(True),
+                            'data': _break_into_individual_experts(
+                                t, key, (prepend_axis_num + tp_axis, tp_rank, tp_size), replica_id
+                            ),
+                        }
+                    else:
+                        sub_states = ShardedTensor.from_rank_offsets(
+                            key,
+                            t.contiguous(),
+                            *sharded_offsets,
+                            (prepend_axis_num, ep_rank, ep_size),
+                            (prepend_axis_num + 1 + tp_axis, tp_rank, tp_size),
+                            replica_id=replica_id,
+                            prepend_axis_num=prepend_axis_num,
+                        )
+            return sub_states  # pylint: disable=possibly-used-before-assignment
+
+        @torch.no_grad()
+        def sh_ten_merge_fn(sub_state_dict, tp_axis: int, with_glu: bool):
+            if tp_axis == 1:
+                # weight1
+                weight_shape = (self.config.hidden_size, -1)
+            elif tp_axis == 0:
+                # weight2
+                weight_shape = (-1, self.config.hidden_size)
+                assert with_glu == False
+            else:
+                raise ValueError("tp_axis should be 0 or 1.")
+            if isinstance(sub_state_dict, dict):
+                assert sub_state_dict['singleton_local_shards']
+                if with_glu:
+                    assert isinstance(sub_state_dict['data'], dict)
+                    sub_state_dict = torch.cat(
+                        (
+                            torch.stack(sub_state_dict['data']['w']),
+                            torch.stack(sub_state_dict['data']['v']),
+                        ),
+                        dim=-2,
+                    )
+                else:
+                    assert isinstance(sub_state_dict['data'], list)
+                    sub_state_dict = torch.stack(sub_state_dict['data'])
+            else:
+                if with_glu:
+                    sub_state_dict = torch.cat(sub_state_dict, -2)
+            return sub_state_dict.transpose(-1, -2).reshape(weight_shape)
+
+        state_dict = self.state_dict(prefix='', keep_vars=True)
+        for name, tensor in state_dict.items():
+            if name == 'weight1':
+                tp_axis = 1
+                with_glu = self.config.gated_linear_unit
+                wkey = f'{prefix}experts.linear_fc1.weight'
+            else:
+                tp_axis = 0
+                with_glu = False
+                wkey = f'{prefix}experts.linear_fc2.weight'
+
+            this_replica_id = list(copy.deepcopy(replica_id))
+
+            sharded_state_dict[f'{prefix}{name}'] = ShardedTensorFactory(
+                wkey,
+                tensor,
+                partial(sh_ten_build_fn, tp_axis=tp_axis, with_glu=with_glu),
+                partial(sh_ten_merge_fn, tp_axis=tp_axis, with_glu=with_glu),
+                tuple(this_replica_id),
+            )
+
+        replica_id = (0, tp_rank, dp_rank)
+        # Add fake _extra_state to be compatible with SequentialMLP
+        for expert_local_idx in range(self.num_local_experts):
+            expert_global_idx = local_expert_indices_offset + expert_local_idx
+            if singleton_local_shards:
+                expert_sharded_offsets = sharded_offsets
+            else:
+                expert_sharded_offsets = (
+                    *sharded_offsets,
+                    (len(sharded_offsets), expert_global_idx, num_global_experts),
+                )
+            for mod in ['linear_fc1', 'linear_fc2']:
+                if singleton_local_shards:
+                    expert_key = f'{prefix}experts.{expert_global_idx}.{mod}._extra_state'
+                else:
+                    expert_key = f'{prefix}experts.{mod}._extra_state'
+                sharded_state_dict[f'{prefix}expert{expert_global_idx}.{mod}._extra_state'] = (
+                    make_sharded_object_for_checkpoint(
+                        None, expert_key, expert_sharded_offsets, replica_id
+                    )
+                )
+
+        return sharded_state_dict
+
+    def backward_dw(self):
+        """Performs backward pass for weight gradients in Experts.
+        Empty implementation for compatibility with SequentialMLP and TEGroupedMLP.
+        """
+        pass
+
+
 class GroupedLinearFc1Interface(Protocol):
     """Interface for linear_fc1 module in TEGroupedMLP."""
 
@@ -237,15 +719,37 @@ def __init__(
             set_save_original_input(self.linear_fc2)
 
         # This is to avoid the CPU overhead of multiple d2h copies
-        if self.offload_expert_fc1:
+        if self.offload_expert_fc1 and not self.config.fp8:
             from megatron.core.extensions.transformer_engine import set_save_original_input
 
             set_save_original_input(self.linear_fc1)
 
+        # Fused implementation with Transformer Engine op fuser API
+        if self.config.use_transformer_engine_op_fuser:
+            assert (
+                self._is_fused_impl_supported()
+            ), "Fused GroupedMLP is not supported for this configuration."
+        self._with_fused_impl: bool = self.config.use_transformer_engine_op_fuser
+        self._fused_ops: Optional[Tuple[torch.nn.Module]] = None
+        if (
+            self.config.gated_linear_unit
+            and self.config.moe_mlp_glu_interleave_size is not None
+            and not self._with_fused_impl
+        ):
+            logger.warning(
+                "`moe_mlp_glu_interleave_size=%s` is enabled, but fused MoE MLP implementation "
+                "is not supported for this configuration. The non-fused path may incur extra "
+                "tensor reordering/copy overhead each forward pass.",
+                self.config.moe_mlp_glu_interleave_size,
+            )
+
         if self.config.fp8 or self.config.fp4:
             assert HAVE_TE, "FP8 and FP4 requires TE."
-            self.quantization_padding = Fp8Padding(self.num_local_experts)
-            self.quantization_unpadding = Fp8Unpadding(self.num_local_experts)
+            align_size = 256 if self._with_fused_impl else None
+            self.quantization_padding = Fp8Padding(self.num_local_experts, align_size=align_size)
+            self.quantization_unpadding = Fp8Unpadding(
+                self.num_local_experts, align_size=align_size
+            )
 
     @staticmethod
     def _apply_bias(intermediate_parallel, bias_parallel, tokens_per_expert, permuted_probs):
@@ -267,6 +771,269 @@ def _apply_bias(intermediate_parallel, bias_parallel, tokens_per_expert, permute
             .to(intermediate_parallel.dtype)
         )
 
+    def _is_fused_impl_supported(self) -> bool:
+        """Check if the TE op fuser supports implementing this module.
+
+        Logs a warning for each unsatisfied condition to aid debugging
+        (e.g. when CUDA graph fails because the CuTe DSL fused kernel
+        was not activated and GroupedLinear falls back to tolist()).
+        """
+
+        def _unsupported(reason):
+            logger.warning("TE fused GroupedMLP not available: %s", reason)
+            return False
+
+        # Check Transformer Engine installation
+        if not HAVE_TE:
+            return _unsupported("Transformer Engine is not installed")
+        try:
+            from transformer_engine.pytorch.ops import GroupedLinear, ScaledSwiGLU
+        except ImportError:
+            return _unsupported("TE too old (missing pytorch.ops.GroupedLinear)")
+
+        if not is_te_min_version("2.14.0"):
+            return _unsupported("TE version < 2.14.0")
+
+        # Check for unsupported features
+        if self.tp_group.size() > 1:
+            return _unsupported(f"expert TP > 1 (tp_size={self.tp_group.size()})")
+        if self.offload_expert_fc1 or self.offload_moe_act:
+            return _unsupported("fine-grained activation offloading enabled")
+        if self.config.moe_apply_probs_on_input:
+            return _unsupported("moe_apply_probs_on_input enabled")
+
+        # Check grouped linear modules
+        if not isinstance(self.linear_fc1, te.pytorch.GroupedLinear):
+            return _unsupported(f"linear_fc1 is {type(self.linear_fc1).__name__}")
+        if not isinstance(self.linear_fc2, te.pytorch.GroupedLinear):
+            return _unsupported(f"linear_fc2 is {type(self.linear_fc2).__name__}")
+
+        # Check activation: SwiGLU or quick GEGLU (ScaledClampedQGeGLU, TE >= 2.15)
+        # Use config.activation_func instead of self.activation_func because when
+        # use_te_activation_func is True, self.activation_func is a TE module, not the raw function.
+        if not self.config.gated_linear_unit:
+            return _unsupported("gated_linear_unit not enabled")
+        if self.config.activation_func == F.silu:
+            pass  # SwiGLU — supported
+        elif self.config.activation_func == quick_gelu:
+            try:
+                from transformer_engine.pytorch.ops import ScaledClampedQGeGLU  # noqa: F401
+            except ImportError:
+                return _unsupported("quick_gelu needs TE >= 2.15")
+        else:
+            return _unsupported(f"unsupported activation: {self.config.activation_func}")
+
+        # Check TE CuTe DSL fused kernel conditions (must match TE's
+        # fuse_grouped_mlp_ops matching logic)
+        import os
+
+        if os.environ.get("NVTE_CUTEDSL_FUSED_GROUPED_MLP", "0") == "0":
+            return _unsupported(
+                "NVTE_CUTEDSL_FUSED_GROUPED_MLP not set — CuTe DSL fused kernel disabled"
+            )
+        if self.config.moe_mlp_glu_interleave_size != 32:
+            return _unsupported(
+                f"moe_mlp_glu_interleave_size={self.config.moe_mlp_glu_interleave_size} "
+                f"(CuTe DSL requires 32)"
+            )
+
+        return True
+
+    def _make_fused_ops(self) -> torch.nn.Module:
+        """Construct fused module for FC1, activation, and FC2."""
+
+        # Container for fusible ops
+        ops = te.pytorch.ops.Sequential()
+
+        # Check if there are 1 or "num_gemms" params in the GroupedLinear module.
+        fc1_single_grouped_weight = self.linear_fc1.single_grouped_weight
+        fc1_weight_dtype = (
+            self.linear_fc1.weight.dtype
+            if fc1_single_grouped_weight
+            else self.linear_fc1.weight0.dtype
+        )
+        fc2_single_grouped_weight = self.linear_fc2.single_grouped_weight
+        fc2_weight_dtype = (
+            self.linear_fc2.weight.dtype
+            if fc2_single_grouped_weight
+            else self.linear_fc2.weight0.dtype
+        )
+        fc1_single_grouped_bias = self.linear_fc1.single_grouped_bias
+        fc2_single_grouped_bias = self.linear_fc2.single_grouped_bias
+
+        # TODO:ksivamani: Why meta device?
+        op = te.pytorch.ops.GroupedLinear(
+            self.linear_fc1.num_gemms,
+            self.linear_fc1.in_features,
+            self.linear_fc1.out_features,
+            bias=self.linear_fc1.use_bias,
+            device=torch.cuda.current_device(),
+            dtype=fc1_weight_dtype,
+            accumulate_into_main_grad=self.linear_fc1.fuse_wgrad_accumulation,
+            single_grouped_weight=fc1_single_grouped_weight,
+            single_grouped_bias=fc1_single_grouped_bias,
+            delay_wgrad_compute=self.config.delay_wgrad_compute,
+        )
+
+        # Copy the weights from GroupedLinear module to GroupedLinear op.
+        if fc1_single_grouped_weight:
+            setattr(op, "weight", getattr(self.linear_fc1, "weight"))
+
+        for idx in range(self.linear_fc1.num_gemms):
+            if not fc1_single_grouped_weight:
+                setattr(op, f"weight{idx}", getattr(self.linear_fc1, f"weight{idx}"))
+            if self.linear_fc1.use_bias and not fc1_single_grouped_bias:
+                setattr(op, f"bias{idx}", getattr(self.linear_fc1, f"bias{idx}"))
+        if self.linear_fc1.use_bias and fc1_single_grouped_bias:
+            setattr(op, "bias", getattr(self.linear_fc1, "bias"))
+        ops.append(op)
+
+        # Activation and post-multiply probs (SwiGLU or clamped quick-GEGL)
+        glu_interleave = self.config.moe_mlp_glu_interleave_size
+        if self.config.activation_func == F.silu and self.config.gated_linear_unit:
+            op = te.pytorch.ops.ScaledSwiGLU(glu_interleave_size=glu_interleave)
+        elif self.config.activation_func == quick_gelu and self.config.gated_linear_unit:
+            clamp = self.config.activation_func_clamp_value
+            if clamp is not None:
+                op = te.pytorch.ops.ScaledClampedQGeGLU(
+                    glu_interleave_size=glu_interleave, limit=clamp
+                )
+            else:
+                op = te.pytorch.ops.ScaledClampedQGeGLU(glu_interleave_size=glu_interleave)
+        else:
+            raise RuntimeError(
+                "_make_fused_ops expected SwiGLU or quick_gelu with gated_linear_unit; "
+                "call _is_fused_impl_supported() before constructing fused ops."
+            )
+        ops.append(op)
+
+        # FC2
+        op = te.pytorch.ops.GroupedLinear(
+            self.linear_fc2.num_gemms,
+            self.linear_fc2.in_features,
+            self.linear_fc2.out_features,
+            bias=self.linear_fc2.use_bias,
+            device=torch.cuda.current_device(),
+            dtype=fc2_weight_dtype,
+            accumulate_into_main_grad=self.linear_fc2.fuse_wgrad_accumulation,
+            single_grouped_weight=fc2_single_grouped_weight,
+            single_grouped_bias=fc2_single_grouped_bias,
+            delay_wgrad_compute=self.config.delay_wgrad_compute,
+        )
+
+        # Copy the weights from GroupedLinear module to GroupedLinear op.
+        if fc2_single_grouped_weight:
+            setattr(op, "weight", getattr(self.linear_fc2, "weight"))
+
+        for idx in range(self.linear_fc2.num_gemms):
+            if not fc2_single_grouped_weight:
+                setattr(op, f"weight{idx}", getattr(self.linear_fc2, f"weight{idx}"))
+            if self.linear_fc2.use_bias and not fc2_single_grouped_bias:
+                setattr(op, f"bias{idx}", getattr(self.linear_fc2, f"bias{idx}"))
+        if self.linear_fc2.use_bias and fc2_single_grouped_bias:
+            setattr(op, "bias", getattr(self.linear_fc2, "bias"))
+        ops.append(op)
+
+        # Emulate submodule pre-forward hooks
+        ops.register_forward_pre_hook(self._make_fused_impl_pre_forward_hook())
+
+        return ops
+
+    def _make_fused_impl_pre_forward_hook(self) -> Callable:
+        """Make function that calls submodule pre-forward callback hooks.
+
+        This is intended for compatibility with
+        DistributedDataParallel hooks that trigger parameter
+        all-gathers. It does not support general pre-forward hooks
+        since they may manipulate intermediate tensors that are never
+        instantiated by the fused implementation.
+
+        """
+
+        def forward_pre_hook(module, *_) -> None:
+            for submodule in chain(self.linear_fc1.modules(), self.linear_fc2.modules()):
+                for hook in submodule._forward_pre_hooks.values():
+                    # Assume that hook does not interact with input
+                    ret = hook(submodule, None)
+                    if ret is not None:
+                        raise RuntimeError(
+                            f"Applying a fused implementation for {self.__class__.__name__}, "
+                            f"but a {submodule.__class__.__name__} submodule "
+                            "has a pre-forward hook that modifies the input tensor."
+                        )
+
+        return forward_pre_hook
+
+    def _fused_forward(
+        self,
+        permuted_local_hidden_states: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        permuted_probs: torch.Tensor,
+    ) -> torch.Tensor:
+        """Forward pass using Transformer Engine operation fuser API."""
+
+        # Construct fused impl if needed
+        # Note: We initialize during the first forward pass in case
+        # the params are modified after the constructor.
+        # Note: The fused impl is stored in a tuple to avoid
+        # registering submodules.
+        if self._fused_ops is None:
+            self._fused_ops = (self._make_fused_ops(),)
+        (ops,) = self._fused_ops
+
+        # Apply padding if needed
+        unpadded_tokens_per_expert = None
+        if skip_routed_expert_padding(self.config):
+            pass
+        elif self.config.fp8 or self.config.fp4:
+            tokens_per_expert = tokens_per_expert.tolist()
+            unpadded_tokens_per_expert = tokens_per_expert
+            permuted_local_hidden_states, tokens_per_expert = self.quantization_padding(
+                permuted_local_hidden_states, tokens_per_expert
+            )
+            permuted_probs, _ = self.quantization_padding(
+                permuted_probs.unsqueeze(-1), unpadded_tokens_per_expert
+            )
+            permuted_probs = permuted_probs.squeeze(-1)
+            tokens_per_expert = torch.tensor(
+                tokens_per_expert, dtype=torch.int, device=permuted_probs.device
+            )
+        # if the number of tokens is 0, pad the hidden states to 256
+
+        if self.config.moe_paged_stash:
+            permuted_local_hidden_states = paged_stash_group_start(permuted_local_hidden_states)
+            max_num_tokens = permuted_local_hidden_states.shape[0]
+            # Average/expected tokens is a pre-padding estimate used by paged stashing heuristics.
+            # moe_expert_rank_capacity_factor is required when moe_paged_stash is enabled.
+            cap_factor = self.config.moe_expert_rank_capacity_factor
+            avg_num_tokens = (
+                int(max_num_tokens // cap_factor)
+                if cap_factor is not None and cap_factor > 0
+                else None
+            )
+            stash_context = get_paged_stash_context(
+                name="grouped_mlp",
+                max_num_tokens=max_num_tokens,
+                num_tokens_tensor=tokens_per_expert.sum(),
+                avg_num_tokens=avg_num_tokens,
+            )
+        else:
+            stash_context = nullcontext()
+        with stash_context:
+            # Call fused impl
+            output = ops(
+                permuted_local_hidden_states,
+                tokens_per_expert,  # FC1
+                permuted_probs,  # Scaled SwiGLU
+                tokens_per_expert,  # FC2
+            )
+        # Remove padding if needed
+        if unpadded_tokens_per_expert is not None:
+            output = self.quantization_unpadding(output, unpadded_tokens_per_expert)
+        if self.config.moe_paged_stash:
+            output = paged_stash_group_commit(output, name="grouped_mlp")
+        return output
+
     def bias_act_func(self, intermediate_parallel, bias_parallel, permuted_probs):
         """
         Applies bias and activation function to the output of linear_fc1.
@@ -341,17 +1108,29 @@ def forward(
         Return:
             output (torch.Tensor): The output of the local experts.
         """
+
+        # Call fused impl if enabled
+        if self._with_fused_impl:
+            output = self._fused_forward(
+                permuted_local_hidden_states, tokens_per_expert, permuted_probs
+            )
+            output_bias = None
+            return output, output_bias
+
+        # Apply padding if needed
+        unpadded_tokens_per_expert = None
         tokens_per_expert: list[int] = tokens_per_expert.tolist()
-        if self.config.fp8 or self.config.fp4:
-            actual_tokens_per_expert = tokens_per_expert
+        permuted_probs = permuted_probs.unsqueeze(-1)
+        if skip_routed_expert_padding(self.config):
+            pass
+        elif self.config.fp8 or self.config.fp4:
+            unpadded_tokens_per_expert = tokens_per_expert
             permuted_local_hidden_states, tokens_per_expert = self.quantization_padding(
                 permuted_local_hidden_states, tokens_per_expert
             )
             permuted_probs, _ = self.quantization_padding(
-                permuted_probs.unsqueeze(-1), actual_tokens_per_expert
+                permuted_probs, unpadded_tokens_per_expert
             )
-        else:
-            permuted_probs = permuted_probs.unsqueeze(-1)
 
         if self.config.moe_apply_probs_on_input:
             assert (
@@ -363,43 +1142,130 @@ def forward(
             # Probs already applied, so reset to 1.
             permuted_probs = torch.ones_like(permuted_probs)
 
-        with off_interface(
+        expert_fc1_manager = off_interface(
             self.offload_expert_fc1, permuted_local_hidden_states, "expert_fc1"
-        ) as permuted_local_hidden_states:
+        )
+        with expert_fc1_manager as permuted_local_hidden_states:
             fc1_output, bias_parallel = apply_module(self.linear_fc1)(
                 permuted_local_hidden_states, tokens_per_expert
             )
-        if self.offload_expert_fc1:
-            fc1_output = off_interface.group_commit(
-                fc1_output,
-                name="expert_fc1",
-                forced_released_tensors=[permuted_local_hidden_states],
+        fc1_output = expert_fc1_manager.group_offload(
+            fc1_output,
+            forced_released_tensors=[permuted_local_hidden_states],
+            delay_offload=self.config.delay_offload_until_cuda_graph,
+        )
+
+        def bias_act_func(intermediate_parallel, bias_parallel, permuted_probs):
+
+            # Whether activation function is interleaved GLU
+            with_glu_interleaving = (
+                self.config.gated_linear_unit
+                and self.config.moe_mlp_glu_interleave_size is not None
             )
 
+            def remove_glu_interleaving(x: torch.Tensor) -> torch.Tensor:
+                """Reorder tensor so gate and linear units are contiguous.
+
+                Should only be applied if the activation function is
+                an interleaved GLU.
+
+                """
+                shape = x.size()
+                interleave_size = self.config.moe_mlp_glu_interleave_size
+                x = x.reshape(-1, shape[-1] // (2 * interleave_size), 2, interleave_size)
+                x = x.transpose(1, 2).contiguous()
+                x = x.view(shape)
+                return x
+
+            if self.config.use_te_activation_func:
+                if bias_parallel is not None:
+                    intermediate_parallel = intermediate_parallel + bias_parallel
+                if with_glu_interleaving:
+                    intermediate_parallel = remove_glu_interleaving(intermediate_parallel)
+                intermediate_parallel = self.activation_func(intermediate_parallel)
+                if permuted_probs is not None:
+                    original_dtype = intermediate_parallel.dtype
+                    intermediate_parallel = intermediate_parallel * permuted_probs
+                    intermediate_parallel = intermediate_parallel.to(original_dtype)
+            elif self.config.bias_activation_fusion and not with_glu_interleaving:
+                if self.activation_func == F.silu and self.config.gated_linear_unit:
+                    # dtype is handled inside the fused kernel
+                    intermediate_parallel = weighted_bias_swiglu_impl(
+                        intermediate_parallel,
+                        bias_parallel,
+                        permuted_probs,
+                        self.config.activation_func_fp8_input_store,
+                        self.config.activation_func_clamp_value,
+                    )
+                elif self.activation_func == quick_gelu and self.config.gated_linear_unit:
+                    intermediate_parallel = weighted_bias_quick_geglu_impl(
+                        intermediate_parallel,
+                        bias_parallel,
+                        permuted_probs,
+                        self.config.activation_func_fp8_input_store,
+                        self.config.glu_linear_offset,
+                        self.config.activation_func_clamp_value,
+                    )
+                else:
+                    raise ValueError(
+                        "Only support fusion of swiglu and quick_gelu in TEGroupedMLP."
+                    )
+            elif (
+                self.activation_func == squared_relu and self.config.use_fused_weighted_squared_relu
+            ):
+                assert bias_parallel is None
+                intermediate_parallel = weighted_squared_relu_impl(
+                    intermediate_parallel, permuted_probs
+                )
+            else:
+                if self.config.gated_linear_unit:
+
+                    def glu(x):
+                        if with_glu_interleaving:
+                            x = remove_glu_interleaving(x)
+                        x_glu, x_linear = torch.chunk(x, 2, dim=-1)
+                        if (val := self.config.activation_func_clamp_value) is not None:
+                            x_glu = x_glu.clamp(min=None, max=val)
+                            x_linear = x_linear.clamp(min=-val, max=val)
+                        return self.config.activation_func(x_glu) * (
+                            x_linear + self.config.glu_linear_offset
+                        )
+
+                    intermediate_parallel = glu(intermediate_parallel)
+                else:
+                    intermediate_parallel = self.activation_func(intermediate_parallel)
+                original_dtype = intermediate_parallel.dtype
+                intermediate_parallel = intermediate_parallel * permuted_probs
+                intermediate_parallel = intermediate_parallel.to(original_dtype)
+            return intermediate_parallel
+
+        moe_act_manager = off_interface(self.offload_moe_act, fc1_output, "moe_act")
         if self.activation_recompute:
             self.activation_checkpoint = tensor_parallel.CheckpointWithoutOutput()
-            with off_interface(self.offload_moe_act, fc1_output, "moe_act") as fc1_output:
+            with moe_act_manager as fc1_output:
                 bias_act_output = self.activation_checkpoint.checkpoint(
                     self.bias_act_func, fc1_output, bias_parallel, permuted_probs
                 )
         else:
-            with off_interface(self.offload_moe_act, fc1_output, "moe_act") as fc1_output:
-                bias_act_output = self.bias_act_func(fc1_output, bias_parallel, permuted_probs)
+            with moe_act_manager as fc1_output:
+                bias_act_output = bias_act_func(fc1_output, bias_parallel, permuted_probs)
+
         output, output_bias = apply_module(self.linear_fc2)(bias_act_output, tokens_per_expert)
         if self.activation_recompute:
             self.activation_checkpoint.discard_output_and_register_recompute(output)
 
         # Delay the offload of the moe act until after the linear_fc2 has been computed
         # to make sure the fc1_output is reloaded to GPU before recomputing moe_act.
-        if self.offload_moe_act:
-            output = off_interface.group_commit(
-                output, name="moe_act", forced_released_tensors=[fc1_output]
-            )
+        output = moe_act_manager.group_offload(
+            output,
+            forced_released_tensors=[fc1_output],
+            delay_offload=self.config.delay_offload_until_cuda_graph,
+        )
         output = self._apply_bias(output, output_bias, tokens_per_expert, permuted_probs)
 
         # upad and concat the output
-        if self.config.fp8 or self.config.fp4:
-            output = self.quantization_unpadding(output, actual_tokens_per_expert)
+        if unpadded_tokens_per_expert is not None:
+            output = self.quantization_unpadding(output, unpadded_tokens_per_expert)
 
         output_bias = None
 
@@ -453,6 +1319,27 @@ def backward_dw(self):
         If an error occurs during execution, it is caught and re-raised with a
         descriptive message.
         """
+        if self._with_fused_impl and self.config.delay_wgrad_compute:
+            if self._fused_ops is not None:
+                (seq,) = self._fused_ops
+                fused_children = list(seq.children())
+                assert len(fused_children) >= 3, "expected FC1, activation, FC2 in fused TE ops"
+                fused_children[2].backward_dw()
+                fused_children[0].backward_dw()
+                # DDP registers wgrad hooks on the original linear_fc1/fc2 module objects
+                # (those are in the nn.Module tree), but backward_dw() is called on the
+                # NEW GroupedLinear instances created by _make_fused_ops().  We must
+                # explicitly fire the hooks on the originals so DDP can zero param.grad
+                # and trigger reduce-scatter – otherwise param.grad is never cleared and
+                # AccumulateGrad performs a spurious add_ into main_grad.
+                # TODO: find a better place to invoke _trigger_wgrad_accumulation_and_reduce_hooks.
+                # The wgrad hook registration lives in TE while the trigger is issued here
+                # in MCore, so the hook lifecycle is split across both codebases. Consolidate
+                # ownership on one side (either register+trigger entirely in TE, or expose
+                # the fused backward_dw through MCore) to remove this fragmentation.
+                self.linear_fc2._trigger_wgrad_accumulation_and_reduce_hooks()
+                self.linear_fc1._trigger_wgrad_accumulation_and_reduce_hooks()
+            return
         self.linear_fc2.backward_dw()
         self.linear_fc1.backward_dw()
 
diff --git a/megatron/core/transformer/moe/fused_a2a.py b/megatron/core/transformer/moe/fused_a2a.py
index 39f50a4a670..07f33deca6c 100644
--- a/megatron/core/transformer/moe/fused_a2a.py
+++ b/megatron/core/transformer/moe/fused_a2a.py
@@ -3,6 +3,8 @@
 # Copyright (c) 2025 DeepSeek
 # Licensed under the MIT License - https://github.com/deepseek-ai/DeepEP/blob/main/LICENSE
 
+from typing import Optional
+
 from megatron.core.utils import internal_api
 
 try:
@@ -280,9 +282,12 @@ def init_hybrid_ep_buffer(
     hidden_dim: int,
     seq_len: int,
     num_local_experts: int,
-    num_sms_dispatch_api: int,
-    num_sms_combine_api: int,
-    fp8_dispatch: bool,
+    num_sms_dispatch_api: Optional[int] = None,
+    num_sms_combine_api: Optional[int] = None,
+    num_blocks_permute: Optional[int] = None,
+    num_blocks_unpermute: Optional[int] = None,
+    fp8_dispatch: bool = False,
+    num_sms_preprocessing_api: Optional[int] = None,
 ) -> None:
     '''
     Initialize the HybridEP buffer, including buffer allocation and metadata
@@ -301,23 +306,39 @@ def init_hybrid_ep_buffer(
             Maximum sequence length of the input tensor.
         num_local_experts (int):
             Number of local experts.
-        num_sms_dispatch_api (int):
+        num_sms_dispatch_api (Optional[int]):
             Number of SMs used by the dispatch API.
-        num_sms_combine_api (int):
+        num_sms_combine_api (Optional[int]):
             Number of SMs used by the combine API.
+        num_blocks_permute (Optional[int]):
+            Number of blocks used by the permute part.
+        num_blocks_unpermute (Optional[int]):
+            Number of blocks used by the unpermute part.
         fp8_dispatch (bool):
             Whether to use FP8 communication during the dispatch phase.
+        num_sms_preprocessing_api (Optional[int]):
+            Number of SMs used by the preprocessing (metadata scan) kernel.
     '''
     assert not fp8_dispatch, "HybridEP dispatcher does not support fp8 dispatch now"
     global _hybrid_ep_buffer
+    kwargs = {}
+    if num_sms_dispatch_api is not None:
+        kwargs['num_sms_dispatch_api'] = num_sms_dispatch_api
+    if num_sms_combine_api is not None:
+        kwargs['num_sms_combine_api'] = num_sms_combine_api
+    if num_blocks_permute is not None:
+        kwargs['num_blocks_permute'] = num_blocks_permute
+    if num_blocks_unpermute is not None:
+        kwargs['num_blocks_unpermute'] = num_blocks_unpermute
+    if num_sms_preprocessing_api is not None:
+        kwargs['num_sms_preprocessing_api'] = num_sms_preprocessing_api
     _hybrid_ep_buffer = HybridEPBuffer(
         group=group,
         hidden_dim=hidden_dim,
         max_num_of_tokens_per_rank=seq_len,
         num_local_experts=num_local_experts,
         use_fp8=fp8_dispatch,
-        num_sms_dispatch_api=num_sms_dispatch_api,
-        num_sms_combine_api=num_sms_combine_api,
+        **kwargs,
     )
 
 
@@ -342,14 +363,35 @@ def forward(
         probs,
         group,
         num_local_experts,
-        num_sms_dispatch_api=24,
-        num_sms_combine_api=24,
+        num_sms_dispatch_api=None,
+        num_sms_combine_api=None,
+        num_blocks_permute=None,
+        num_blocks_unpermute=None,
+        fused=False,
         num_permuted_tokens=None,
         pad_multiple=None,
+        num_sms_preprocessing_api=108,
     ):
         '''
         Forward pass of fused dispatch of the HybridEP backend
         '''
+        if fused or num_blocks_permute is not None or num_blocks_unpermute is not None:
+            import inspect
+            import warnings
+
+            sig = inspect.signature(HybridEPBuffer.dispatch_with_permute)
+            if 'fuse_permute_dispatch' not in sig.parameters:
+                warnings.warn(
+                    "Current DeepEP version does not support fused permute dispatch or "
+                    "num_blocks_permute/num_blocks_unpermute. Falling back to unfused "
+                    "HybridEP dispatch.",
+                    UserWarning,
+                    stacklevel=2,
+                )
+                fused = False
+                num_blocks_permute = None
+                num_blocks_unpermute = None
+
         if _hybrid_ep_buffer is None:
             seq_len, hidden_dim = x.shape[-2:]
             fp8_dispatch = False  # Currently, we do not support fp8 dispatch
@@ -360,7 +402,10 @@ def forward(
                 num_local_experts,
                 num_sms_dispatch_api,
                 num_sms_combine_api,
+                num_blocks_permute,
+                num_blocks_unpermute,
                 fp8_dispatch,
+                num_sms_preprocessing_api,
             )
         # If we provide the num_permuted_tokens, we do not need to use sync to
         # wait for the data in pinned memory ready
@@ -381,10 +426,12 @@ def forward(
             pad_multiple=pad_multiple,
             num_permuted_tokens=num_permuted_tokens,
             non_blocking=non_blocking,
+            **({"fuse_permute_dispatch": fused} if fused else {}),
         )
 
         ctx.handle = handle
         ctx.pad_multiple = pad_multiple
+        ctx.fused = fused
         return (
             dispatched_hidden,
             dispatched_probs,
@@ -400,9 +447,27 @@ def backward(ctx, grad_x, grad_probs, grad_scaling_factor, grad_tokens_per_exper
         '''
         handle = ctx.handle
         combined_hidden, combined_probs = _hybrid_ep_buffer.combine_with_unpermute(
-            hidden=grad_x, probs=grad_probs, handle=handle, pad_multiple=ctx.pad_multiple
+            hidden=grad_x,
+            probs=grad_probs,
+            handle=handle,
+            pad_multiple=ctx.pad_multiple,
+            **({"fuse_unpermute_combine": ctx.fused} if ctx.fused else {}),
+        )
+        return (
+            combined_hidden,
+            None,
+            combined_probs,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
         )
-        return combined_hidden, None, combined_probs, None, None, None, None, None, None, None
 
 
 @internal_api
@@ -412,16 +477,20 @@ class HybridEPCombine(torch.autograd.Function):
     '''
 
     @staticmethod
-    def forward(ctx, x, handle, num_permuted_tokens=None, pad_multiple=None):
+    def forward(ctx, x, handle, num_permuted_tokens=None, pad_multiple=None, fused=False):
         '''
         Forward pass of fused combine of the HybridEP backend
         '''
         combined_hidden, _ = _hybrid_ep_buffer.combine_with_unpermute(
-            hidden=x, handle=handle, pad_multiple=pad_multiple
+            hidden=x,
+            handle=handle,
+            pad_multiple=pad_multiple,
+            **({"fuse_unpermute_combine": fused} if fused else {}),
         )
         ctx.handle = handle
         ctx.pad_multiple = pad_multiple
         ctx.num_permuted_tokens = num_permuted_tokens
+        ctx.fused = fused
         return combined_hidden
 
     @staticmethod
@@ -436,6 +505,7 @@ def backward(ctx, grad_x):
             handle=handle,
             pad_multiple=ctx.pad_multiple,
             num_permuted_tokens=ctx.num_permuted_tokens,
+            **({"fuse_permute_dispatch": ctx.fused} if ctx.fused else {}),
         )
         return dispatched_hidden, None, None, None, None
 
@@ -449,10 +519,14 @@ def hybrid_ep_dispatch(
         probs,
         group,
         num_local_experts,
-        num_sms_dispatch_api=24,
-        num_sms_combine_api=24,
+        num_sms_dispatch_api=None,
+        num_sms_combine_api=None,
+        num_blocks_permute=None,
+        num_blocks_unpermute=None,
+        fused=False,
         num_permuted_tokens=None,
         pad_multiple=None,
+        num_sms_preprocessing_api=108,
     ):
         '''
         Perform fused dispatch for "permute + dispatch a2a + permute" using the
@@ -469,10 +543,14 @@ def hybrid_ep_dispatch(
                 Process group used for communication.
             num_local_experts (int):
                 Number of local experts.
-            num_sms_dispatch_api (int):
+            num_sms_dispatch_api (Optional[int]):
                 Number of SMs used by the dispatch API.
-            num_sms_combine_api (int):
+            num_sms_combine_api (Optional[int]):
                 Number of SMs used by the combine API.
+            num_blocks_permute (Optional[int]):
+                Number of blocks used by the permute part.
+            num_blocks_unpermute (Optional[int]):
+                Number of blocks used by the unpermute part.
             num_permuted_tokens (int):
                 Number of tokens after permute. HybridEP uses this to allocate buffers.
                 If not provided, HybridEP obtains the size from a GPU tensor,
@@ -480,6 +558,8 @@ def hybrid_ep_dispatch(
             pad_multiple (int):
                 Alignment multiple required for FP8 GEMM. If not provided, no padding
                 is performed.
+            num_sms_preprocessing_api (int):
+                Number of SMs used by the preprocessing (metadata scan) kernel.
         '''
         return HybridEPDispatch.apply(
             x,
@@ -489,12 +569,16 @@ def hybrid_ep_dispatch(
             num_local_experts,
             num_sms_dispatch_api,
             num_sms_combine_api,
+            num_blocks_permute,
+            num_blocks_unpermute,
+            fused,
             num_permuted_tokens,
             pad_multiple,
+            num_sms_preprocessing_api,
         )
 
     @internal_api
-    def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple):
+    def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple, fused=False):
         '''
         Perform fused combine operation for unpermute + combine a2a + unpermute
         using the HybridEP backend
@@ -511,7 +595,7 @@ def hybrid_ep_combine(x, handle, num_permuted_tokens, pad_multiple):
                 The alignment multiple required for FP8 GEMM. If not provided, no padding
                 is performed.
         '''
-        return HybridEPCombine.apply(x, handle, num_permuted_tokens, pad_multiple)
+        return HybridEPCombine.apply(x, handle, num_permuted_tokens, pad_multiple, fused)
 
 else:
     hybrid_ep_dispatch = None
diff --git a/megatron/core/transformer/moe/moe_layer.py b/megatron/core/transformer/moe/moe_layer.py
index a64afee719f..11a4bd1a8b2 100644
--- a/megatron/core/transformer/moe/moe_layer.py
+++ b/megatron/core/transformer/moe/moe_layer.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import math
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
 from typing import Optional, Protocol
@@ -12,11 +13,13 @@
 from megatron.core.extensions.transformer_engine import HAVE_TE
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.moe.moe_logging import get_moe_overload_factor_tracker
 from megatron.core.transformer.moe.moe_utils import (
     MoECudaGraphPartialCaptureSignal,
     MoECudaGraphTensorStore,
     get_default_pg_collection,
     maybe_skip_or_early_return_by_cudagraph,
+    record_dispatch_token_counts,
 )
 from megatron.core.transformer.moe.router import TopKRouter
 from megatron.core.transformer.moe.token_dispatcher import (
@@ -233,10 +236,14 @@ def __init__(
         )
 
         self.tp_group = pg_collection.tp
+        self.tp_ep_group = pg_collection.tp_ep
 
         # Initialize router.
         self.router = self.submodules.router(
-            config=self.config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer
+            config=self.config,
+            pg_collection=pg_collection,
+            is_mtp_layer=is_mtp_layer,
+            layer_number=layer_number,
         )
         self.tp_group = pg_collection.tp
 
@@ -345,6 +352,11 @@ def __init__(
         self.cudagraph_tensor_store = MoECudaGraphTensorStore()
         self.fwd_execution_map = ["route", "expert_compute", "postprocess"]
 
+        if self.config.log_moe_overload_factor:
+            get_moe_overload_factor_tracker().set_process_groups(
+                tp_ep_group=self.tp_ep_group, expt_dp_group=pg_collection.expt_dp
+            )
+
         # Setup events and streams for delayed wgrad computation.
         self.setup_delayed_wgrad_for_dispatch_backward_overlap()
 
@@ -412,13 +424,18 @@ def unset_inference_cuda_graphed_iteration(self):
             self.shared_expert_overlap = self._saved_shared_expert_overlap
 
     @maybe_skip_or_early_return_by_cudagraph("route")
-    def route(self, hidden_states: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
+    def route(
+        self,
+        hidden_states: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+    ):
         """Compute token routing for preprocessing.
 
         This method uses the router to determine which experts to send each token to,
         producing routing probabilities and a mapping.
         """
-        probs, routing_map = apply_module(self.router)(hidden_states, padding_mask)
+        probs, routing_map = apply_module(self.router)(hidden_states, padding_mask, input_ids)
         return probs, routing_map
 
     @maybe_skip_or_early_return_by_cudagraph("preprocess")
@@ -441,6 +458,16 @@ def preprocess(
         )
         return hidden_states, probs
 
+    @staticmethod
+    def _num_token_rows_from_moe_hidden_states(hidden_states: torch.Tensor) -> int:
+        """Product of all dims except the hidden/last (same as view(-1, H) row count)."""
+        if hidden_states.dim() < 2:
+            raise ValueError(
+                "MoE hidden_states must be at least 2D [..., hidden_size], "
+                f"got shape {tuple(hidden_states.shape)}"
+            )
+        return int(math.prod(hidden_states.shape[:-1]))
+
     def dispatch(self, hidden_states: torch.Tensor, probs: torch.Tensor):
         """Dispatches tokens to assigned expert ranks via communication.
 
@@ -480,6 +507,39 @@ def shared_experts_compute(self, hidden_states: torch.Tensor):
 
         return shared_expert_output
 
+    def _maybe_record_overload_factor(
+        self, dispatched_input: torch.Tensor, tokens_per_expert: torch.Tensor
+    ) -> torch.Tensor:
+        """Wrap dispatched_input with overload logging when log_moe_overload_factor is set.
+
+        Uses _overload_log_num_local_tokens captured from forward hidden_states and
+        applies AllGather fair-share scaling so report()'s SUM over TP×EP matches one
+        global balanced count when the map is replicated on every rank.
+
+        Recording is skipped when not in training mode (e.g. Megatron validation uses
+        model.eval()) so eval forwards do not pollute train-only overload stats.
+        """
+        if not self.config.log_moe_overload_factor or not self.training:
+            return dispatched_input
+        num_local_tokens = getattr(self, "_overload_log_num_local_tokens", None)
+        if num_local_tokens is None:
+            return dispatched_input
+        tp_ep_world_size = float(self.tp_ep_group.size())
+        local_balanced_count = float(num_local_tokens) * float(self.config.moe_router_topk)
+        token_dispatcher = self.token_dispatcher
+        if isinstance(token_dispatcher, MoEAllGatherTokenDispatcher) and (
+            token_dispatcher.tp_size > 1 or token_dispatcher.ep_size > 1
+        ):
+            local_balanced_count = local_balanced_count / tp_ep_world_size
+        local_balanced = torch.empty((), device=dispatched_input.device, dtype=torch.float32)
+        local_balanced.fill_(local_balanced_count)
+        return record_dispatch_token_counts(
+            tensor=dispatched_input,
+            tokens_per_expert=tokens_per_expert,
+            local_balanced_token_count=local_balanced,
+            layer_number=self.layer_number,
+        )
+
     @internal_api
     def routed_experts_compute(self, hidden_states: torch.Tensor, probs: torch.Tensor):
         """Computes the output of the routed experts on the dispatched tokens.
@@ -492,9 +552,11 @@ def routed_experts_compute(self, hidden_states: torch.Tensor, probs: torch.Tenso
             hidden_states = _RecordExpertDgradCompletion.apply(
                 self._delayed_wgrad_event, hidden_states
             )
+
         dispatched_input, tokens_per_expert, permuted_probs = (
             self.token_dispatcher.dispatch_postprocess(hidden_states, probs)
         )
+        dispatched_input = self._maybe_record_overload_factor(dispatched_input, tokens_per_expert)
         if (
             hasattr(self, "_inference_token_dispatcher")
             and self.is_inference_cuda_graphed_iteration
@@ -545,6 +607,7 @@ def forward(
         hidden_states: torch.Tensor,
         intermediate_tensors=None,
         padding_mask: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
     ):
         """Forward pass for the MoE layer.
 
@@ -559,6 +622,8 @@ def forward(
             padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
                                                    Shape [seq_length, bsz]. True for valid tokens,
                                                    False for padding tokens. Defaults to None.
+            input_ids (torch.Tensor, optional): The input IDs tensor. Shape [seq_length, bsz].
+                                                Defaults to None.
         Returns:
             A tuple containing the output tensor and the MLP bias, if any.
         """
@@ -576,7 +641,11 @@ def custom_forward(hidden_states, intermediate_tensors=None, padding_mask=None):
             try:
                 if "route" in self.fwd_execution_map:
                     shared_expert_output = self.shared_experts_compute(hidden_states)
-                    probs, routing_map = self.route(hidden_states, padding_mask)
+                    if self.config.log_moe_overload_factor and self.training:
+                        self._overload_log_num_local_tokens = (
+                            self._num_token_rows_from_moe_hidden_states(hidden_states)
+                        )
+                    probs, routing_map = self.route(hidden_states, padding_mask, input_ids)
                     hidden_states, probs = self.preprocess(hidden_states, probs, routing_map)
 
                     if intermediate_tensors is not None:
diff --git a/megatron/core/transformer/moe/moe_logging.py b/megatron/core/transformer/moe/moe_logging.py
new file mode 100644
index 00000000000..16b60f66276
--- /dev/null
+++ b/megatron/core/transformer/moe/moe_logging.py
@@ -0,0 +1,745 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""MoE metrics tracking and logging.
+
+Collects per-layer MoE metrics during forward passes, synchronizes them across
+distributed ranks, and writes scalar summaries to TensorBoard / W&B.
+
+Usage:
+    tracker = get_moe_metrics_tracker()
+
+    # In router forward pass:
+    tracker.record("load_balancing_loss", loss, layer_number=1, num_layers=32,
+                   reduce_group=tp_cp_group)
+
+    # At end of training step:
+    log_str = tracker.report(
+        loss_scale=1 / num_microbatches,
+        iteration=step,
+        writer=tb_writer,
+        num_layers=32,
+    )
+"""
+
+from dataclasses import dataclass
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.process_groups_config import ProcessGroupCollection
+
+
+@dataclass
+class MetricEntry:
+    """Per-layer metric with distributed reduction configuration."""
+
+    values: torch.Tensor
+    reduce_group: Optional[torch.distributed.ProcessGroup] = None
+    avg_group: Optional[torch.distributed.ProcessGroup] = None
+    needs_dp_avg: bool = True
+
+
+# ---------------------------------------------------------------------------
+# Module-level global tracker (follows parallel_state / global_vars pattern)
+# ---------------------------------------------------------------------------
+_MOE_METRICS_TRACKER: Optional['MoEMetricsTracker'] = None
+
+
+def get_moe_metrics_tracker() -> 'MoEMetricsTracker':
+    """Return the global MoE metrics tracker, creating it lazily if needed."""
+    global _MOE_METRICS_TRACKER
+    if _MOE_METRICS_TRACKER is None:
+        _MOE_METRICS_TRACKER = MoEMetricsTracker()
+    return _MOE_METRICS_TRACKER
+
+
+def set_moe_metrics_tracker(tracker: 'MoEMetricsTracker') -> None:
+    """Set the global MoE metrics tracker."""
+    global _MOE_METRICS_TRACKER
+    _MOE_METRICS_TRACKER = tracker
+
+
+def destroy_moe_metrics_tracker() -> None:
+    """Reset the global MoE metrics tracker to ``None``."""
+    global _MOE_METRICS_TRACKER
+    _MOE_METRICS_TRACKER = None
+
+
+# ---------------------------------------------------------------------------
+# MoE Overload Factor Tracker (same pattern as MoEMetricsTracker)
+# ---------------------------------------------------------------------------
+_MOE_OVERLOAD_FACTOR_TRACKER: Optional['MoEOverloadFactorTracker'] = None
+
+
+def get_moe_overload_factor_tracker() -> 'MoEOverloadFactorTracker':
+    """Return the global MoE overload factor tracker, creating it lazily if needed."""
+    global _MOE_OVERLOAD_FACTOR_TRACKER
+    if _MOE_OVERLOAD_FACTOR_TRACKER is None:
+        _MOE_OVERLOAD_FACTOR_TRACKER = MoEOverloadFactorTracker()
+    return _MOE_OVERLOAD_FACTOR_TRACKER
+
+
+def set_moe_overload_factor_tracker(tracker: 'MoEOverloadFactorTracker') -> None:
+    """Set the global MoE overload factor tracker."""
+    global _MOE_OVERLOAD_FACTOR_TRACKER
+    _MOE_OVERLOAD_FACTOR_TRACKER = tracker
+
+
+def destroy_moe_overload_factor_tracker() -> None:
+    """Reset the global MoE overload factor tracker to None."""
+    global _MOE_OVERLOAD_FACTOR_TRACKER
+    _MOE_OVERLOAD_FACTOR_TRACKER = None
+
+
+class MoEOverloadFactorTracker:
+    """Tracker for MoE overload-factor metrics.
+
+    Reductions are over tp_ep then expt_dp (expert data parallel), not dense dp,
+    so overload stats stay within the same expert partition across replicas.
+
+    Lifecycle: MoELayer records counts when log_moe_overload_factor is set (training only);
+    report() at step end (sync, aggregate, log, deferred clear) → repeat.
+
+    Example:
+        tracker = get_moe_overload_factor_tracker()
+        log_str = tracker.report(iteration=100, writer=tb_writer)
+    """
+
+    def __init__(self) -> None:
+        self._layer_fwd_tokens: Dict[int, List[torch.Tensor]] = {}
+        # layer_idx -> list of 0-dim float (tokens on rank)
+        self._layer_fwd_balanced: Dict[int, List[torch.Tensor]] = {}
+        # same keys as _layer_fwd_tokens, balanced token count per entry
+        self._cumulative_tokens_timeline: List[torch.Tensor] = []
+        # +actual tokens on forward, - on backward (mirrors balanced timeline).
+        self._cumulative_balanced_timeline: List[torch.Tensor] = []
+        self._tp_ep_group: Optional[torch.distributed.ProcessGroup] = None
+        self._expt_dp_group: Optional[torch.distributed.ProcessGroup] = None
+        self._pending_clear: bool = False
+
+    def set_process_groups(
+        self,
+        tp_ep_group: Optional[torch.distributed.ProcessGroup] = None,
+        expt_dp_group: Optional[torch.distributed.ProcessGroup] = None,
+    ) -> None:
+        """Set process groups for reduction (MoELayer.__init__ when log_moe_overload_factor)."""
+        if tp_ep_group is not None:
+            self._tp_ep_group = tp_ep_group
+        if expt_dp_group is not None:
+            self._expt_dp_group = expt_dp_group
+
+    def _clear_storage(self) -> None:
+        self._layer_fwd_tokens.clear()
+        self._layer_fwd_balanced.clear()
+        self._cumulative_tokens_timeline.clear()
+        self._cumulative_balanced_timeline.clear()
+
+    def _flush_pending_clear(self) -> None:
+        if self._pending_clear:
+            self._pending_clear = False
+            self._clear_storage()
+
+    def record_fwd(
+        self,
+        layer_number: Optional[int],
+        tokens_on_rank: torch.Tensor,
+        local_balanced_token_count: torch.Tensor,
+    ) -> None:
+        """Record forward token total on this rank (0-dim float) and balanced count scalar."""
+        self._flush_pending_clear()
+        if layer_number is None:
+            return
+        layer_idx = layer_number - 1
+        if layer_idx not in self._layer_fwd_tokens:
+            self._layer_fwd_tokens[layer_idx] = []
+            self._layer_fwd_balanced[layer_idx] = []
+        self._layer_fwd_tokens[layer_idx].append(tokens_on_rank.detach())
+        self._layer_fwd_balanced[layer_idx].append(local_balanced_token_count.detach())
+        self._cumulative_tokens_timeline.append(tokens_on_rank.detach())
+        self._cumulative_balanced_timeline.append(local_balanced_token_count.detach())
+
+    def record_bwd(
+        self, tokens_on_rank: torch.Tensor, local_balanced_token_count: torch.Tensor
+    ) -> None:
+        """Record backward-pass (negated actual and balanced count) for paired cumsums."""
+        self._flush_pending_clear()
+        self._cumulative_tokens_timeline.append(-tokens_on_rank.detach())
+        self._cumulative_balanced_timeline.append(-local_balanced_token_count.detach())
+
+    def _pipeline_group_and_use_reduce(
+        self,
+    ) -> Tuple[Optional[torch.distributed.ProcessGroup], bool]:
+        pp_group = (
+            parallel_state.get_pipeline_model_parallel_group(check_initialized=False)
+            if torch.distributed.is_initialized()
+            else None
+        )
+        use_pp_reduce = (
+            pp_group is not None and torch.distributed.get_world_size(group=pp_group) > 1
+        )
+        return pp_group, use_pp_reduce
+
+    def _flatten_recorded_tokens(self) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+        fwd_tensors: List[torch.Tensor] = []
+        balanced_tensors: List[torch.Tensor] = []
+        if self._layer_fwd_tokens:
+            for layer_idx in sorted(self._layer_fwd_tokens.keys()):
+                for t, b in zip(
+                    self._layer_fwd_tokens[layer_idx], self._layer_fwd_balanced[layer_idx]
+                ):
+                    fwd_tensors.append(t)
+                    balanced_tensors.append(b)
+        return fwd_tensors, balanced_tensors
+
+    def _pp_allreduce_empty_tracker(self, pp_group: torch.distributed.ProcessGroup) -> None:
+        """Ranks without MoE still join PP collectives so peers do not hang."""
+        device = (
+            torch.device('cuda', torch.cuda.current_device())
+            if torch.cuda.is_available()
+            else torch.device('cpu')
+        )
+        pp_buf = torch.zeros(2, device=device, dtype=torch.float32)
+        torch.distributed.all_reduce(pp_buf, group=pp_group, op=torch.distributed.ReduceOp.MAX)
+
+    def _validate_overload_tensor_lists(
+        self, num_entries: int, num_layers: int, num_balanced: int
+    ) -> None:
+        if num_entries % num_layers != 0:
+            raise ValueError(
+                f"Overload factor tracker: num_entries ({num_entries}) must be "
+                f"divisible by num_layers ({num_layers})."
+            )
+        if num_balanced != num_entries:
+            raise ValueError(
+                f"Overload factor tracker: balanced_tensors length ({num_balanced}) "
+                f"must match fwd_tensors ({num_entries})."
+            )
+
+    def _max_cum_overload_if_timeline(
+        self,
+        tp_ep_group: Optional[torch.distributed.ProcessGroup],
+        expt_dp_group: Optional[torch.distributed.ProcessGroup],
+    ) -> Optional[float]:
+        """Cumulative actual vs balanced token count; ratio of peaks across ranks."""
+        if not self._cumulative_tokens_timeline:
+            return None
+        if len(self._cumulative_balanced_timeline) != len(self._cumulative_tokens_timeline):
+            raise ValueError(
+                f"Overload tracker: _cumulative_tokens_timeline "
+                f"({len(self._cumulative_tokens_timeline)}) and "
+                f"_cumulative_balanced_timeline "
+                f"({len(self._cumulative_balanced_timeline)}) length mismatch."
+            )
+        fwd_bwd_stacked = torch.stack(
+            [t.float() for t in self._cumulative_tokens_timeline], dim=0
+        )  # [num_events]
+        balanced_fwd_bwd_stacked = torch.stack(
+            [t.float() for t in self._cumulative_balanced_timeline], dim=0
+        )
+        cum_actual = fwd_bwd_stacked.cumsum(dim=0)
+        cum_balanced = balanced_fwd_bwd_stacked.cumsum(dim=0)
+        local_actual_peak = cum_actual.max()
+        local_balanced_peak = cum_balanced.max()
+        cum_overload_ratio = torch.where(
+            local_balanced_peak > 0,
+            local_actual_peak / (local_balanced_peak + 1e-8),
+            local_actual_peak.new_zeros(()),
+        ).unsqueeze(0)
+        if tp_ep_group is not None:
+            torch.distributed.all_reduce(
+                cum_overload_ratio, group=tp_ep_group, op=torch.distributed.ReduceOp.MAX
+            )
+        if expt_dp_group is not None:
+            torch.distributed.all_reduce(
+                cum_overload_ratio, group=expt_dp_group, op=torch.distributed.ReduceOp.MAX
+            )
+        return cum_overload_ratio.item()
+
+    def _tp_ep_overload_from_lists(
+        self,
+        fwd_tensors: List[torch.Tensor],
+        balanced_tensors: List[torch.Tensor],
+        tp_ep_group: Optional[torch.distributed.ProcessGroup],
+    ) -> Tuple[torch.Tensor, torch.device]:
+        """Max actual per entry over tp_ep, balanced sum per entry, then overload ratio."""
+        actual_tokens_stacked = torch.stack([t.float() for t in fwd_tensors], dim=0)
+        device = actual_tokens_stacked.device
+        if tp_ep_group is not None:
+            tp_ep_world = float(tp_ep_group.size())
+            max_actual = actual_tokens_stacked.clone()
+            torch.distributed.all_reduce(
+                max_actual, group=tp_ep_group, op=torch.distributed.ReduceOp.MAX
+            )
+        else:
+            tp_ep_world = 1.0
+            max_actual = actual_tokens_stacked
+
+        balanced_stacked = torch.stack(
+            [b.to(device=device, dtype=torch.float32) for b in balanced_tensors], dim=0
+        )
+        if tp_ep_group is not None:
+            torch.distributed.all_reduce(balanced_stacked, group=tp_ep_group)
+        balanced_per_rank = balanced_stacked / tp_ep_world
+        tp_ep_overload = max_actual / (balanced_per_rank + 1e-8)
+        return tp_ep_overload, device
+
+    def _expt_dp_reduce_overload(
+        self, tp_ep_overload: torch.Tensor, expt_dp_group: Optional[torch.distributed.ProcessGroup]
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Average and worst-case overload across expert-DP replicas (per entry)."""
+        if expt_dp_group is not None:
+            overload_avg = tp_ep_overload.clone()
+            torch.distributed.all_reduce(
+                overload_avg, group=expt_dp_group, op=torch.distributed.ReduceOp.AVG
+            )
+            overload_max = tp_ep_overload.clone()
+            torch.distributed.all_reduce(
+                overload_max, group=expt_dp_group, op=torch.distributed.ReduceOp.MAX
+            )
+        else:
+            overload_avg = tp_ep_overload
+            overload_max = tp_ep_overload
+        return overload_avg, overload_max
+
+    def _pp_reduce_max_overload_scalars(
+        self,
+        max_overload_factor: float,
+        max_cum_overload_factor: Optional[float],
+        device: torch.device,
+        pp_group: torch.distributed.ProcessGroup,
+    ) -> Tuple[float, float]:
+        max_cum_value = (
+            float(max_cum_overload_factor) if max_cum_overload_factor is not None else 0.0
+        )
+        pp_buf = torch.tensor(
+            [max_overload_factor, max_cum_value], device=device, dtype=torch.float32
+        )
+        torch.distributed.all_reduce(pp_buf, group=pp_group, op=torch.distributed.ReduceOp.MAX)
+        return pp_buf[0].item(), pp_buf[1].item()
+
+    def _log_overload_metrics(
+        self,
+        iteration: int,
+        writer,
+        wandb_writer,
+        avg_overload_factor: float,
+        max_overload_factor: float,
+        max_cum_overload_factor: Optional[float],
+        per_layer_logging: bool,
+        overload_avg: torch.Tensor,
+        overload_max: torch.Tensor,
+        num_layers: int,
+        num_entries: int,
+    ) -> None:
+        if writer is not None:
+            writer.add_scalar("moe/avg_overload_factor", avg_overload_factor, iteration)
+            writer.add_scalar("moe/max_overload_factor", max_overload_factor, iteration)
+            if max_cum_overload_factor is not None:
+                writer.add_scalar("moe/max_cum_overload_factor", max_cum_overload_factor, iteration)
+        if wandb_writer is not None:
+            wandb_writer.log({"moe/avg_overload_factor": avg_overload_factor}, iteration)
+            wandb_writer.log({"moe/max_overload_factor": max_overload_factor}, iteration)
+            if max_cum_overload_factor is not None:
+                wandb_writer.log(
+                    {"moe/max_cum_overload_factor": max_cum_overload_factor}, iteration
+                )
+
+        if per_layer_logging:
+            entries_per_layer = num_entries // num_layers
+            layer_avg = overload_avg.view(num_layers, entries_per_layer).mean(dim=1)
+            layer_max = overload_max.view(num_layers, entries_per_layer).max(dim=1).values
+            for i in range(num_layers):
+                avg_val, max_val = layer_avg[i].item(), layer_max[i].item()
+                if writer is not None:
+                    writer.add_scalar(f"moe/avg_overload_factor_layer_{i}", avg_val, iteration)
+                    writer.add_scalar(f"moe/max_overload_factor_layer_{i}", max_val, iteration)
+                if wandb_writer is not None:
+                    wandb_writer.log(
+                        {
+                            f"moe/avg_overload_factor_layer_{i}": avg_val,
+                            f"moe/max_overload_factor_layer_{i}": max_val,
+                        },
+                        iteration,
+                    )
+
+    def report(
+        self, iteration: int, writer=None, wandb_writer=None, per_layer_logging: bool = False
+    ) -> str:
+        """Reduce data, overload factors, log to TB/W&B, defer clear, return log string."""
+        pp_group, use_pp_reduce = self._pipeline_group_and_use_reduce()
+        tp_ep_group = self._tp_ep_group
+        expt_dp_group = self._expt_dp_group
+        fwd_tensors, balanced_tensors = self._flatten_recorded_tokens()
+
+        if not fwd_tensors:
+            if use_pp_reduce:
+                assert pp_group is not None
+                self._pp_allreduce_empty_tracker(pp_group)
+            self.clear()
+            return ""
+
+        num_entries = len(fwd_tensors)
+        num_layers = len(self._layer_fwd_tokens)
+        self._validate_overload_tensor_lists(num_entries, num_layers, len(balanced_tensors))
+
+        max_cum_overload_factor = self._max_cum_overload_if_timeline(tp_ep_group, expt_dp_group)
+        tp_ep_overload, device = self._tp_ep_overload_from_lists(
+            fwd_tensors, balanced_tensors, tp_ep_group
+        )
+        overload_avg, overload_max = self._expt_dp_reduce_overload(tp_ep_overload, expt_dp_group)
+
+        avg_overload_factor = overload_avg.mean().item()
+        max_overload_factor = overload_max.max().item()
+
+        if use_pp_reduce:
+            assert pp_group is not None
+            max_overload_factor, max_cum_reduced = self._pp_reduce_max_overload_scalars(
+                max_overload_factor, max_cum_overload_factor, device, pp_group
+            )
+            max_cum_overload_factor = max_cum_reduced
+
+        self._log_overload_metrics(
+            iteration,
+            writer,
+            wandb_writer,
+            avg_overload_factor,
+            max_overload_factor,
+            max_cum_overload_factor,
+            per_layer_logging,
+            overload_avg,
+            overload_max,
+            num_layers,
+            num_entries,
+        )
+
+        self.clear()
+
+        parts = [
+            f" avg overload factor: {avg_overload_factor:.3f} |",
+            f" max overload factor: {max_overload_factor:.3f} |",
+        ]
+        if max_cum_overload_factor is not None:
+            parts.append(f" max cum overload factor: {max_cum_overload_factor:.3f} |")
+        return "".join(parts)
+
+    def clear(self) -> None:
+        """Mark stored tensors for reset on the next record_fwd or record_bwd.
+
+        Does not drop list contents yet, so captured tensor references stay valid
+        until the next recording hook runs. Process groups are kept.
+        """
+        self._pending_clear = True
+
+
+class MoEMetricsTracker:
+    """Tracker for MoE layer-wise metrics.
+
+    Lifecycle: ``record()`` per-layer values during forward → ``report()`` at
+    step end (sync, aggregate, log, clear) → repeat.
+
+    Example:
+        tracker = get_moe_metrics_tracker()
+        tracker.record("load_balancing_loss", loss, layer_number=1, num_layers=32)
+        log_str = tracker.report(loss_scale=1/8, iteration=100, writer=tb_writer,
+                                 num_layers=32)
+    """
+
+    def __init__(self):
+        self._metrics: Dict[str, MetricEntry] = {}
+
+    # =========================================================================
+    # Public API
+    # =========================================================================
+
+    @property
+    def metrics(self) -> Dict[str, MetricEntry]:
+        """Read-only access to the underlying metric entries."""
+        return self._metrics
+
+    def record(
+        self,
+        name: str,
+        value: torch.Tensor,
+        layer_number: int,
+        num_layers: int,
+        reduce_group: Optional[torch.distributed.ProcessGroup] = None,
+        avg_group: Optional[torch.distributed.ProcessGroup] = None,
+        needs_dp_avg: bool = True,
+    ) -> None:
+        """Accumulate a metric value for a specific layer.
+
+        Called during the router forward pass.  Lazily creates the metric entry
+        on first call for each metric name.
+
+        Args:
+            name: Metric name (e.g. ``"load_balancing_loss"``).
+            value: Scalar tensor to accumulate (will be detached).
+            layer_number: 1-based layer index.
+            num_layers: Total number of layers (determines tensor size).
+            reduce_group: Process group for sum-reduction (e.g. tp_cp_group).
+            avg_group: Process group for average-reduction.
+            needs_dp_avg: Whether to average across DP ranks after other reductions.
+        """
+        if layer_number is None:
+            return
+
+        if name not in self._metrics:
+            self._metrics[name] = MetricEntry(values=torch.zeros(num_layers, device=value.device))
+
+        entry = self._metrics[name]
+        entry.values[layer_number - 1] += value.detach()
+        entry.reduce_group = reduce_group
+        entry.avg_group = avg_group
+        entry.needs_dp_avg = needs_dp_avg
+
+    def report(
+        self,
+        loss_scale: float,
+        iteration: int,
+        writer=None,
+        wandb_writer=None,
+        per_layer_logging: bool = False,
+        force_initialize: bool = False,
+        track_names: Optional[Union[str, List[str]]] = None,
+        num_layers: Optional[int] = None,
+        moe_layer_freq: Optional[Union[int, List[int]]] = None,
+        mtp_num_layers: Optional[int] = None,
+        total_loss_dict: Optional[dict[str, torch.Tensor]] = None,
+        percentiles: Optional[Dict[str, List[float]]] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+    ) -> str:
+        """Sync metrics across ranks, aggregate, log, and clear.
+
+        This is the main entry point called once per training step.  It pairs
+        with :meth:`record`: you *record* individual data points during forward,
+        then *report* the summary at step end.
+
+        Args:
+            loss_scale: Scale factor for averaging across microbatches
+                (usually ``1 / num_microbatches``).
+            iteration: Current training iteration.
+            writer: TensorBoard ``SummaryWriter`` (optional).
+            wandb_writer: Weights & Biases run object (optional).
+            per_layer_logging: Whether to also write per-layer values.
+            force_initialize: If True, pre-create metric entries for *track_names*
+                that don't exist yet.  Required for PP ranks without MoE layers
+                whose tensor sizes must match ranks that do have MoE layers.
+            track_names: Metric name(s) to report.  ``None`` reports all.
+            num_layers: Total transformer layers (required when *force_initialize*).
+            moe_layer_freq: MoE layer frequency or binary pattern list.
+            mtp_num_layers: Extra layers from Multi-Token Prediction.
+            total_loss_dict: Megatron training-loop accumulator.  Metrics
+                ending with ``"loss"`` are accumulated here and excluded from
+                the returned console log string.
+            percentiles: Per-metric percentiles to compute, e.g.
+                ``{"load_imbalance": [0.5, 0.95]}``.
+            pg_collection: Custom process-group collection for reduction.
+
+        Returns:
+            Formatted log string for console output.
+        """
+        metric_names = self._resolve_names(track_names)
+
+        # Pre-create entries on PP ranks that lack MoE layers.
+        # Tensor size must be (num_layers + mtp_num_layers) to match ranks that
+        # recorded via record(), otherwise all_reduce across PP will hang.
+        if force_initialize:
+            if num_layers is None:
+                raise ValueError("num_layers must be provided when force_initialize=True.")
+            init_size = num_layers + (mtp_num_layers or 0)
+            for name in metric_names:
+                self.ensure_initialized(name, init_size)
+
+        self._sync_metrics(metric_names, pg_collection)
+
+        num_moe_layers = self._count_moe_layers(num_layers, moe_layer_freq, mtp_num_layers)
+        scalars = self._aggregate(loss_scale, num_moe_layers, metric_names, percentiles)
+
+        # Megatron integration: accumulate loss metrics into total_loss_dict
+        console_scalars = dict(scalars)
+        if total_loss_dict is not None:
+            for k, v in scalars.items():
+                if k.lower().endswith("loss"):
+                    if k in total_loss_dict:
+                        total_loss_dict[k] += v
+                    else:
+                        total_loss_dict[k] = v
+                    console_scalars.pop(k)
+
+        self._log_scalars(scalars, iteration, writer, wandb_writer)
+        if per_layer_logging:
+            self._log_per_layer(
+                loss_scale, metric_names, iteration, writer, wandb_writer, percentiles
+            )
+
+        log_string = self._format(console_scalars)
+        self.clear()
+        return log_string
+
+    def clear(self) -> None:
+        """Zero out all metric values (entries are kept for reuse)."""
+        for entry in self._metrics.values():
+            entry.values.zero_()
+
+    def ensure_initialized(
+        self, name: str, num_layers: int, device: Optional[Union[str, torch.device, int]] = None
+    ) -> None:
+        """Pre-create a metric entry if it does not already exist.
+
+        This is needed for PP ranks that have no MoE layers -- their tensor
+        size must match ranks that do, otherwise ``all_reduce`` across PP hangs.
+
+        Args:
+            name: Metric name.
+            num_layers: Tensor size (should include MTP layers).
+            device: Device for the zero tensor.  Defaults to current CUDA device.
+        """
+        if name not in self._metrics:
+            if device is None:
+                device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
+            self._metrics[name] = MetricEntry(values=torch.zeros(num_layers, device=device))
+
+    # =========================================================================
+    # Private implementation
+    # =========================================================================
+
+    def _resolve_names(self, track_names: Optional[Union[str, List[str]]]) -> List[str]:
+        """Normalize *track_names* argument to a list of strings."""
+        if track_names is None:
+            return list(self._metrics.keys())
+        if isinstance(track_names, str):
+            return [track_names]
+        return track_names
+
+    def _sync_metrics(
+        self, metric_names: List[str], pg_collection: Optional[ProcessGroupCollection] = None
+    ) -> None:
+        """All-reduce metrics across distributed ranks.
+
+        Reduction order: PP collect → reduce_group sum → avg_group avg → DP avg.
+        """
+        if pg_collection is None:
+            pp_group = parallel_state.get_pipeline_model_parallel_group()
+            dp_group = parallel_state.get_data_parallel_group(
+                with_context_parallel=False, partial_data_parallel=False
+            )
+        else:
+            pp_group = pg_collection.pp
+            dp_group = pg_collection.dp
+
+        for name in metric_names:
+            if name not in self._metrics:
+                continue
+
+            entry = self._metrics[name]
+            v = entry.values
+
+            torch.distributed.all_reduce(v, group=pp_group)
+
+            if entry.reduce_group is not None:
+                torch.distributed.all_reduce(v, group=entry.reduce_group)
+
+            if entry.avg_group is not None:
+                torch.distributed.all_reduce(
+                    v, group=entry.avg_group, op=torch.distributed.ReduceOp.AVG
+                )
+
+            if entry.needs_dp_avg:
+                torch.distributed.all_reduce(v, group=dp_group, op=torch.distributed.ReduceOp.AVG)
+
+    @staticmethod
+    def _count_moe_layers(
+        num_layers: Optional[int],
+        moe_layer_freq: Optional[Union[int, List[int]]],
+        mtp_num_layers: Optional[int],
+    ) -> int:
+        """Compute the effective number of MoE layers from configuration."""
+        if moe_layer_freq is None:
+            n = num_layers
+        elif isinstance(moe_layer_freq, int):
+            assert isinstance(num_layers, int)
+            n = sum(1 for i in range(num_layers) if i % moe_layer_freq == 0)
+        elif isinstance(moe_layer_freq, list):
+            n = sum(moe_layer_freq)
+        else:
+            raise ValueError(f"Invalid moe_layer_freq: {moe_layer_freq}")
+
+        if mtp_num_layers is not None:
+            n += mtp_num_layers
+
+        return n
+
+    def _aggregate(
+        self,
+        loss_scale: float,
+        num_moe_layers: int,
+        metric_names: List[str],
+        percentiles: Optional[Dict[str, List[float]]] = None,
+    ) -> Dict[str, Union[float, torch.Tensor]]:
+        """Aggregate per-layer values into scalar summaries.
+
+        Always computes the mean across MoE layers.  If *percentiles* specifies
+        quantiles for a metric, those are computed over non-zero layer values and
+        added as ``"{name}_p{pct}"`` keys.
+        """
+        result: Dict[str, Union[float, torch.Tensor]] = {}
+
+        for name in metric_names:
+            if name not in self._metrics:
+                continue
+
+            values = self._metrics[name].values.float() * loss_scale
+
+            if percentiles and name in percentiles:
+                nonzero = values[values > 0]
+                if nonzero.numel() > 0:
+                    pcts = percentiles[name]
+                    pct_vals = torch.quantile(
+                        nonzero, torch.tensor(pcts, device=nonzero.device)
+                    ).tolist()
+                    for pct, pct_val in zip(pcts, pct_vals):
+                        result[f"{name}_p{int(pct * 100)}"] = pct_val
+
+            result[name] = values.sum() / num_moe_layers
+
+        return result
+
+    def _log_scalars(
+        self, scalars: Dict[str, Union[float, torch.Tensor]], iteration: int, writer, wandb_writer
+    ) -> None:
+        """Write scalar metrics to TensorBoard and/or W&B."""
+        for name, value in scalars.items():
+            if writer is not None:
+                writer.add_scalar(name, value, iteration)
+            if wandb_writer is not None:
+                wandb_writer.log({name: value}, iteration)
+
+    def _log_per_layer(
+        self,
+        loss_scale: float,
+        metric_names: List[str],
+        iteration: int,
+        writer,
+        wandb_writer,
+        percentiles: Optional[Dict[str, List[float]]] = None,
+    ) -> None:
+        """Write per-layer metric values to TensorBoard and/or W&B."""
+        for name in metric_names:
+            if name not in self._metrics:
+                continue
+
+            values = self._metrics[name].values.float() * loss_scale
+            is_sparse = percentiles is not None and name in percentiles
+            for i, val in enumerate(values.tolist()):
+                if is_sparse and val == 0:
+                    continue
+                if writer is not None:
+                    writer.add_scalar(f"moe/{name}_layer_{i}", val, iteration)
+                if wandb_writer is not None:
+                    wandb_writer.log({f"moe/{name}_layer_{i}": val}, iteration)
+
+    @staticmethod
+    def _format(scalars: Dict[str, Union[float, torch.Tensor]]) -> str:
+        """Format aggregated metrics as a console log string."""
+        return "".join(f" {k}: {v:.2f} |" for k, v in scalars.items())
diff --git a/megatron/core/transformer/moe/moe_utils.py b/megatron/core/transformer/moe/moe_utils.py
index 896495d0710..d316d23de10 100644
--- a/megatron/core/transformer/moe/moe_utils.py
+++ b/megatron/core/transformer/moe/moe_utils.py
@@ -1,5 +1,4 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-
 import functools
 import math
 from dataclasses import dataclass
@@ -20,9 +19,13 @@
 from megatron.core.tensor_parallel.mappings import reduce_from_tensor_model_parallel_region
 from megatron.core.transformer.cuda_graphs import is_graph_capturing
 from megatron.core.transformer.enums import CudaGraphScope
+from megatron.core.transformer.moe.moe_logging import (
+    get_moe_metrics_tracker,
+    get_moe_overload_factor_tracker,
+)
 from megatron.core.transformer.moe.router_replay import RouterReplay
 from megatron.core.transformer.transformer_config import TransformerConfig
-from megatron.core.utils import internal_api, is_te_min_version
+from megatron.core.utils import deprecated, internal_api, is_te_min_version
 
 if HAVE_TE:
     from megatron.core.extensions.transformer_engine import (
@@ -52,10 +55,6 @@
     ) = (None, None, None, None, None, None, None, None, None, None)
 
 
-# MOE logging
-_MOE_LAYER_WISE_LOGGING_TRACKER: dict = {}
-
-
 def switch_load_balancing_loss_func(
     probs: torch.Tensor,
     tokens_per_expert: torch.Tensor,
@@ -966,6 +965,9 @@ def apply_router_token_dropping(
     return final_probs, final_map
 
 
+@deprecated(
+    version="0.16", removal_version="0.18", alternative="get_moe_metrics_tracker().record()"
+)
 def save_to_aux_losses_tracker(
     name: str,
     loss: torch.Tensor,
@@ -982,38 +984,112 @@ def save_to_aux_losses_tracker(
         layer_number (int): Layer index of the loss.
         num_layers (int): The number of total layers.
         reduce_group (torch.distributed.ProcessGroup, optional): The group for reducing the loss.
-                                                                 Defaults to None.
+            Defaults to None.
         avg_group (torch.distributed.ProcessGroup, optional): The group for averaging the loss.
-                                                              Defaults to None.
-        reduce_group_has_dp (bool, optional): Whether the reduce group has data parallel ranks.
-            Set this to True if the reduce group has data parallel ranks. This flag is used to
-            ensure the correct reduction in aux loss tracking. Defaults to False.
+            Defaults to None.
+        reduce_group_has_dp (bool, optional): Whether the reduce group already includes DP ranks.
+            If True, DP averaging is skipped. Defaults to False.
     """
-    # Skip aux loss logging if layer_number is None.
-    if layer_number is None:
-        return
-
-    tracker = get_moe_layer_wise_logging_tracker()
-    if name not in tracker:
-        tracker[name] = {}
-        tracker[name]["values"] = torch.zeros(num_layers, device=loss.device)
-    tracker[name]["values"][layer_number - 1] += loss.detach()  # Aggregate the loss for the layer.
-    tracker[name]["reduce_group"] = reduce_group
-    tracker[name]["avg_group"] = avg_group
-    tracker[name]["reduce_group_has_dp"] = reduce_group_has_dp
+    get_moe_metrics_tracker().record(
+        name=name,
+        value=loss,
+        layer_number=layer_number,
+        num_layers=num_layers,
+        reduce_group=reduce_group,
+        avg_group=avg_group,
+        needs_dp_avg=not reduce_group_has_dp,
+    )
 
 
+@deprecated(version="0.16", removal_version="0.18", alternative="get_moe_metrics_tracker().clear()")
 def clear_aux_losses_tracker() -> None:
     """Clear the auxiliary losses."""
-    tracker = get_moe_layer_wise_logging_tracker()
-    for name in tracker:
-        tracker[name]["values"].zero_()
+    get_moe_metrics_tracker().clear()
+
 
+class RecordDispatchTokenCountsFunction(torch.autograd.Function):
+    """Autograd hook: post-dispatch token totals for overload reporting (see report())."""
 
+    @staticmethod
+    def forward(
+        ctx,
+        tensor: torch.Tensor,
+        tokens_per_expert: torch.Tensor,
+        local_balanced_token_count: torch.Tensor,
+        layer_number: Optional[int],
+    ):
+        """Record actual token total and balanced count on forward; pass tensor through.
+
+        Args:
+            tensor: Tensor in the autograd graph (e.g. dispatched_input) — pass-through.
+            tokens_per_expert: Per-local-expert counts from dispatch_postprocess (any
+                device).
+            local_balanced_token_count: Scalar float, num_local_tokens * topk (token
+                rows from forward hidden_states).
+            layer_number: Layer index (1-based).
+
+        Returns:
+            tensor unchanged.
+        """
+        if layer_number is None:
+            return tensor
+
+        tokens_on_rank = (
+            tokens_per_expert.detach().sum().to(device=tensor.device, dtype=torch.float32)
+        )
+
+        balanced = local_balanced_token_count.detach().to(device=tensor.device, dtype=torch.float32)
+
+        tracker = get_moe_overload_factor_tracker()
+        tracker.record_fwd(layer_number, tokens_on_rank, balanced)
+
+        ctx.save_for_backward(tokens_on_rank, balanced)
+        return tensor
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Backward pass: append negated actual and balanced count for paired cumsums."""
+        if ctx.saved_tensors:
+            tokens_on_rank, balanced = ctx.saved_tensors
+            get_moe_overload_factor_tracker().record_bwd(tokens_on_rank, balanced)
+        return grad_output, None, None, None
+
+
+def record_dispatch_token_counts(
+    tensor: torch.Tensor,
+    tokens_per_expert: torch.Tensor,
+    local_balanced_token_count: torch.Tensor,
+    layer_number: Optional[int],
+) -> torch.Tensor:
+    """Wrap tensor with an autograd hook for dispatch token counts (overload metrics).
+
+    Records tokens_per_expert.sum() on this rank and the balanced token count scalar.
+    Overload factors are computed later in MoEOverloadFactorTracker.report(). Process groups
+    must already be registered on the global tracker (MoELayer does this in __init__
+    when log_moe_overload_factor is enabled).
+
+    Args:
+        tensor: Tensor in the autograd graph (typically dispatched_input).
+        tokens_per_expert: Output of dispatch_postprocess (per-expert counts).
+        local_balanced_token_count: Scalar float, num_local_tokens * moe_router_topk
+            (num_local_tokens from MoE forward hidden_states shape).
+        layer_number: Layer index (1-based).
+
+    Returns:
+        tensor unchanged.
+    """
+    return RecordDispatchTokenCountsFunction.apply(
+        tensor, tokens_per_expert, local_balanced_token_count, layer_number
+    )
+
+
+@deprecated(
+    version="0.16", removal_version="0.18", alternative="get_moe_metrics_tracker()._sync_metrics()"
+)
 def reduce_aux_losses_tracker_across_ranks(
     track_names: Optional[List[str]] = None, pg_collection: Optional[ProcessGroupCollection] = None
 ) -> None:
-    """Collect and reduce the auxiliary losses across ranks.
+    """Reduce the auxiliary losses across ranks.
 
     Args:
         track_names (Optional[List[str]], optional):
@@ -1021,40 +1097,28 @@ def reduce_aux_losses_tracker_across_ranks(
         pg_collection (Optional[ProcessGroupCollection], optional):
             The process group collection. Defaults to None.
     """
-    tracker = get_moe_layer_wise_logging_tracker()
-    if track_names is None:
-        track_names = tracker.keys()
-
-    if pg_collection is None:
-        # Use parallel_state groups
-        pp_group = parallel_state.get_pipeline_model_parallel_group()
-        dp_group = parallel_state.get_data_parallel_group(
-            with_context_parallel=False, partial_data_parallel=False
-        )
-    else:
-        pp_group = pg_collection.pp
-        dp_group = pg_collection.dp
-
-    for name in track_names:
-        values = tracker[name]["values"]
-        # TODO(Hepteract): delete the usage of the global parallel_state.
-        # Collect aux losses across PP.
-        torch.distributed.all_reduce(values, group=pp_group)
-        # Reduce aux losses across ranks.
-        if tracker[name].get('reduce_group') is not None:
-            torch.distributed.all_reduce(values, group=tracker[name].get('reduce_group'))
-            # Need to conduct reduction across data parallel ranks. When the reduce_group
-            # does not have 'dp' attribute, do it manually.
-            if not tracker[name].get('reduce_group_has_dp', False):
-                torch.distributed.all_reduce(
-                    values, group=dp_group, op=torch.distributed.ReduceOp.AVG
-                )
-        if tracker[name].get('avg_group') is not None:
-            torch.distributed.all_reduce(
-                values, group=tracker[name]['avg_group'], op=torch.distributed.ReduceOp.AVG
-            )
-
-
+    tracker = get_moe_metrics_tracker()
+    names_list = track_names if track_names is not None else list(tracker.metrics.keys())
+    tracker._sync_metrics(names_list, pg_collection)
+
+
+@deprecated(version="0.16", removal_version="0.18", alternative="get_moe_metrics_tracker().metrics")
+def get_moe_layer_wise_logging_tracker():
+    """Return the moe layer wise tracker in legacy dict format."""
+    return {
+        name: {
+            "values": entry.values,
+            "reduce_group": entry.reduce_group,
+            "avg_group": entry.avg_group,
+            "needs_dp_avg": entry.needs_dp_avg,
+        }
+        for name, entry in get_moe_metrics_tracker().metrics.items()
+    }
+
+
+@deprecated(
+    version="0.15", removal_version="0.17", alternative="get_moe_metrics_tracker().report()"
+)
 def track_moe_metrics(
     loss_scale: float,
     iteration: int,
@@ -1068,95 +1132,25 @@ def track_moe_metrics(
     moe_layer_freq: Optional[Union[int, List[int]]] = None,
     mtp_num_layers: Optional[int] = None,
     pg_collection: Optional[ProcessGroupCollection] = None,
-) -> None:
+) -> str:
     """Track the MoE metrics for logging.
 
-    Args:
-        loss_scale (float): The loss scale.
-        iteration (int): The iteration.
-        writer (SummaryWriter, optional): The tensorboard writer. Defaults to None.
-        wandb_writer (wandb.Run, optional): The wandb writer. Defaults to None.
-        total_loss_dict (dict[str, torch.Tensor], optional): The total loss dictionary.
-                                                             Defaults to None.
-        per_layer_logging (bool, optional): Whether to log per layer. Defaults to False.
-        force_initialize (bool, optional): Whether to force initialize the tracker.
-                                           Defaults to False.
-        track_names (List[str], optional): The names of the losses to track. Defaults to None.
-        num_layers (int, optional): The number of layers. Defaults to None.
-        moe_layer_freq (Union[int, List[int]], optional): The frequency of the MoE layers.
-                                                          Defaults to None.
-        mtp_num_layers (int, optional): The number of layers in the model parallel group.
-                                        Defaults to None.
-        pg_collection (ProcessGroupCollection, optional): The process group collection.
-                                                          Defaults to None.
+    Deprecated: Use get_moe_metrics_tracker().report() directly.
     """
-    # Aux loss logging
-    tracker = get_moe_layer_wise_logging_tracker()
-    # Initialize the tracker if force_initialize is True.
-    # The values tensor size must match what the router creates in save_to_aux_losses_tracker,
-    # which uses (num_layers + mtp_num_layers). This is important for PP ranks that have no
-    # MoE layers (so the tracker is empty and force_initialize creates the entry); their tensor
-    # size must match ranks that do have MoE layers, otherwise all_reduce across PP will hang.
-    tracker_num_layers = num_layers
-    if mtp_num_layers is not None:
-        tracker_num_layers += mtp_num_layers
-    if force_initialize:
-        if track_names is not None:
-            for key in track_names:
-                if key not in tracker:
-                    tracker[key] = {}
-                    tracker[key]["values"] = torch.zeros(tracker_num_layers, device="cuda")
-                    tracker[key]["reduce_group"] = None
-                    tracker[key]["avg_group"] = None
-                    tracker[key]["reduce_group_has_dp"] = False
-    reduce_aux_losses_tracker_across_ranks(track_names, pg_collection=pg_collection)
-
-    # Get number of MoE layers
-    if moe_layer_freq is None:
-        num_moe_layers = num_layers
-    elif isinstance(moe_layer_freq, int):
-        assert isinstance(num_layers, int)
-        moe_layer_pattern = [1 if (i % moe_layer_freq == 0) else 0 for i in range(num_layers)]
-        num_moe_layers = sum(moe_layer_pattern)
-    elif isinstance(moe_layer_freq, list):
-        num_moe_layers = sum(moe_layer_freq)
-    else:
-        raise ValueError(f"Invalid moe_layer_freq: {moe_layer_freq}")
-
-    if mtp_num_layers is not None:
-        num_moe_layers += mtp_num_layers
-
-    aux_losses = {k: v['values'].float() * loss_scale for k, v in tracker.items()}
-    for name, loss_list in aux_losses.items():
-        if total_loss_dict is not None:
-            if name not in total_loss_dict:
-                total_loss_dict[name] = loss_list.sum() / num_moe_layers
-            else:
-                total_loss_dict[name] += loss_list.sum() / num_moe_layers
-        if writer is not None:
-            # currently when using add_scalars,
-            # torch.utils.add_scalars makes each timer its own run, which
-            # polutes the runs list, so we just add each as a scalar
-            writer.add_scalar(name, loss_list.sum() / num_moe_layers, iteration)
-            if per_layer_logging:
-                for i, loss in enumerate(loss_list.tolist()):
-                    writer.add_scalar(f"moe/{name}_layer_{i}", loss, iteration)
-
-            # W&B logging lacks support for logging multiple scalars simultaneously.
-            # As a workaround, we log each scalar individually first, then we can create
-            # a custom panel to manually group them to a single plot.
-            if wandb_writer:
-                wandb_writer.log({f"{name}": loss_list.sum() / num_moe_layers}, iteration)
-                if per_layer_logging:
-                    wandb_writer.log(
-                        {
-                            f"moe/{name}_layer_{i}": loss
-                            for i, loss in enumerate(loss_list.tolist())
-                        },
-                        iteration,
-                    )
-
-    clear_aux_losses_tracker()
+    return get_moe_metrics_tracker().report(
+        loss_scale=loss_scale,
+        iteration=iteration,
+        writer=writer,
+        wandb_writer=wandb_writer,
+        per_layer_logging=per_layer_logging,
+        force_initialize=force_initialize,
+        track_names=track_names,
+        num_layers=num_layers,
+        moe_layer_freq=moe_layer_freq,
+        mtp_num_layers=mtp_num_layers,
+        pg_collection=pg_collection,
+        total_loss_dict=total_loss_dict,
+    )
 
 
 def get_updated_expert_bias(
@@ -1210,12 +1204,6 @@ def maybe_move_tensor_to_cpu(
     return tensor
 
 
-def get_moe_layer_wise_logging_tracker() -> dict:
-    """Return the moe layer wise tracker."""
-    global _MOE_LAYER_WISE_LOGGING_TRACKER
-    return _MOE_LAYER_WISE_LOGGING_TRACKER
-
-
 @internal_api
 class RandomSTE(torch.autograd.Function):
     """
@@ -1428,14 +1416,33 @@ def get_align_size_for_quantization(config: TransformerConfig) -> int:
     Returns:
         int: The alignment size for quantization.
     """
+    # CUTLASS kernel for grouped GEMM assumes 256 alignment.
+    if config.use_transformer_engine_op_fuser:
+        return 256
     if config.fp8:
         return get_fp8_align_size(config.fp8_recipe)
-    elif config.fp4:
+    if config.fp4:
         return get_fp4_align_size(config.fp4_recipe)
     # Only FP8 or FP4 requires padding. Defaults to 0.
     return 0
 
 
+def skip_routed_expert_padding(config: TransformerConfig) -> bool:
+    """Whether the expert module should skip quantization padding.
+
+    Returns True when padding is already applied by the router or the
+    HybridEP dispatcher.
+    """
+    if config.moe_router_padding_for_quantization:
+        return True
+    if (
+        config.moe_token_dispatcher_type == "flex"
+        and config.moe_flex_dispatcher_backend == "hybridep"
+    ):
+        return True
+    return False
+
+
 # TODO(Hepteract): delete the usage of the global parallel_state.
 # Initialize process groups with the global parallel_state.
 def get_default_pg_collection() -> ProcessGroupCollection:
@@ -1497,12 +1504,7 @@ def get_early_return_outputs(
             outputs = [self.kwargs['hidden_states'], self.kwargs['probs']]
             valid_cudagraph_attrs = []
             for attr_name in self.moe_layer.token_dispatcher.cudagraph_attrs:
-                hier_attr_name = attr_name.split('.')
-                attr = self.moe_layer.token_dispatcher
-                for name in hier_attr_name:
-                    attr = getattr(attr, name, None)
-                    if attr is None:
-                        break
+                attr = self.moe_layer.token_dispatcher.get_cudagraph_attr(attr_name)
                 if isinstance(attr, torch.Tensor):
                     outputs.append(attr)
                     valid_cudagraph_attrs.append(attr_name)
diff --git a/megatron/core/transformer/moe/paged_stash.py b/megatron/core/transformer/moe/paged_stash.py
new file mode 100644
index 00000000000..03cfba7c1aa
--- /dev/null
+++ b/megatron/core/transformer/moe/paged_stash.py
@@ -0,0 +1,1406 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import logging
+from contextlib import nullcontext
+from typing import Any
+
+import torch
+import triton
+import triton.language as tl
+
+from megatron.core._rank_utils import log_single_rank
+from megatron.core.full_cuda_graph import FullCudaGraphWrapper
+from megatron.core.optimizer.distrib_optimizer import DistributedOptimizer
+from megatron.core.utils import get_attr_wrapped_model
+
+logger = logging.getLogger(__name__)
+
+GLOBAL_BLOCK_SIZE = 1024
+SCALE_INV_BLOCK_SIZE = 32
+
+
+class PagedStashBuffer:
+    """
+    A paged stash buffer with page-level memory management.
+    Supports both CUDA and optional pinned host buffer for overflow fallback.
+
+    Buffers are organized as [num_pages, page_size, hidden_size].
+    Uses per-buffer free lists (circular buffer) tracked as two-element state: [0]=CUDA, [1]=host.
+    """
+
+    def __init__(
+        self,
+        num_tokens,
+        hidden_size,
+        page_size,
+        device,
+        overflow,
+        host_spill,
+        dtype,
+        num_tokens_host=0,
+    ):
+        """
+        Args:
+            num_tokens: Maximum number of tokens the CUDA buffer can hold
+            hidden_size: Hidden dimension size
+            page_size: Number of tokens per page
+            device: Device for the buffer
+            overflow: Overflow flag tensor (shared across all buffers)
+            host_spill: Global flag set to 1 if any stash used pinned host (shared)
+            dtype: Data type
+            num_tokens_host: If > 0, allocate pinned host buffer with this many tokens for
+                spillover.
+        """
+        self.hidden_size = hidden_size
+        self.page_size = page_size
+        self.device = device
+        self.dtype = dtype
+        self.overflow = overflow  # GPU flag (shared)
+        self.host_spill = host_spill
+
+        # CUDA buffer
+        self.num_cuda_pages = (num_tokens + page_size - 1) // page_size
+        self.total_cuda_tokens = self.num_cuda_pages * page_size
+        self.cuda_buffer = torch.empty(
+            (self.total_cuda_tokens, hidden_size), dtype=dtype, device=device
+        )
+
+        # Host buffer (pinned), optional
+        self.num_host_pages = (
+            (num_tokens_host + page_size - 1) // page_size if num_tokens_host > 0 else 0
+        )
+        self.total_host_tokens = self.num_host_pages * page_size if self.num_host_pages > 0 else 0
+        if self.num_host_pages > 0:
+            self.host_buffer = torch.empty(
+                (self.total_host_tokens, hidden_size), dtype=dtype, device='cpu', pin_memory=True
+            )
+        else:
+            self.host_buffer = None
+
+        # Free list state: shape (2,) index 0 = CUDA, 1 = host (all in device memory for kernel)
+        self.free_list_head = torch.zeros(2, dtype=torch.int64, device=device)
+        self.free_list_tail = torch.tensor(
+            [self.num_cuda_pages, self.num_host_pages], dtype=torch.int64, device=device
+        )
+        self.free_list_capacity = torch.tensor(
+            [self.num_cuda_pages, self.num_host_pages], dtype=torch.int64, device=device
+        )
+
+        # Free list arrays (device memory): page IDs for each buffer
+        self.free_list_cuda = torch.arange(self.num_cuda_pages, dtype=torch.int64, device=device)
+        if self.num_host_pages > 0:
+            self.free_list_host = torch.arange(
+                self.num_host_pages, dtype=torch.int64, device=device
+            )
+        else:
+            self.free_list_host = torch.empty(0, dtype=torch.int64, device=device)
+
+        # Pre-allocated reset values (CUDA graph safe: no allocation in reset())
+        self._reset_tail = torch.tensor(
+            [self.num_cuda_pages, self.num_host_pages], dtype=torch.int64, device=device
+        )
+        self._reset_free_list_cuda = torch.arange(
+            self.num_cuda_pages, dtype=torch.int64, device=device
+        )
+        if self.num_host_pages > 0:
+            self._reset_free_list_host = torch.arange(
+                self.num_host_pages, dtype=torch.int64, device=device
+            )
+        else:
+            self._reset_free_list_host = None
+
+    def reset(self):
+        """Reset both CUDA and host free lists (CUDA graph safe: no new allocations)."""
+        self.free_list_cuda.copy_(self._reset_free_list_cuda)
+        self.free_list_head.zero_()
+        self.free_list_tail.copy_(self._reset_tail)
+        if self._reset_free_list_host is not None:
+            self.free_list_host.copy_(self._reset_free_list_host)
+
+    def __repr__(self):
+        return (
+            f"PagedStashBuffer(num_cuda_pages={self.num_cuda_pages}, "
+            f"num_host_pages={self.num_host_pages}, "
+            f"page_size={self.page_size}, hidden_size={self.hidden_size}, "
+            f"device={self.device}, dtype={self.dtype})"
+        )
+
+
+@triton.jit
+def _paged_stash_copy_kernel(
+    src_ptr,
+    cuda_dst_ptr,
+    host_dst_ptr,
+    num_tokens_ptr,
+    free_list_cuda_ptr,
+    free_list_host_ptr,
+    free_list_head_ptr,  # shape (2,): [cuda_head, host_head]
+    free_list_tail_ptr,  # shape (2,)
+    free_list_capacity_ptr,
+    page_record_ptr,
+    overflow_ptr,
+    host_spill_global_ptr,  # 1 if any successful host spill (not set on overflow path)
+    spilled_to_host_ptr,  # Output: 0 = stored in CUDA, 1 = stored in host or overflow
+    new_free_list_head_ptr,  # Output: shape (2,) updated heads
+    PAGE_SIZE: tl.constexpr,
+    HIDDEN_SIZE: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+    HAS_HOST_BUFFER: tl.constexpr,
+):
+    """Copy tokens to paged stash: try CUDA first (fast path), then host if CUDA full."""
+    pid = tl.program_id(axis=0)
+    num_blocks = tl.num_programs(axis=0)
+
+    # Load overflow first (get in flight early); branch on it only before any write
+    overflow = tl.load(overflow_ptr)
+
+    num_tokens = tl.load(num_tokens_ptr)
+    required_pages = tl.cdiv(num_tokens, PAGE_SIZE)
+
+    # Common case: load only CUDA state (and head_host for output when use_cuda)
+    head_cuda = tl.load(free_list_head_ptr)
+    head_host = tl.load(free_list_head_ptr + 1)
+    tail_cuda = tl.load(free_list_tail_ptr)
+    cap_cuda = tl.load(free_list_capacity_ptr)
+
+    avail_cuda = tail_cuda - head_cuda
+    use_cuda = avail_cuda >= required_pages
+
+    # Assume CUDA path: set everything for GPU stash
+    spill = 0
+    dst_ptr = cuda_dst_ptr
+    free_list_ptr = free_list_cuda_ptr
+    head = head_cuda
+    cap = cap_cuda
+    new_head_cuda = head_cuda + required_pages
+    new_head_host = head_host
+
+    if overflow == 1:
+        # No stash; preserve heads so Python copy_ does not write garbage into the buffer.
+        if pid == 0:
+            tl.store(new_free_list_head_ptr, head_cuda)
+            tl.store(new_free_list_head_ptr + 1, head_host)
+        return
+
+    # Only when CUDA is full: load host state and maybe switch to host
+    if not use_cuda:
+        tail_host = tl.load(free_list_tail_ptr + 1)
+        cap_host = tl.load(free_list_capacity_ptr + 1)
+        use_host = HAS_HOST_BUFFER == 1 and (tail_host - head_host) >= required_pages
+        if use_host:
+            spill = 1
+            dst_ptr = host_dst_ptr
+            free_list_ptr = free_list_host_ptr
+            head = head_host
+            cap = cap_host
+            new_head_cuda = head_cuda
+            new_head_host = head_host + required_pages
+        else:
+            if pid == 0:
+                tl.store(overflow_ptr, 1)
+                tl.store(spilled_to_host_ptr, 1)
+                tl.store(new_free_list_head_ptr, head_cuda)
+                tl.store(new_free_list_head_ptr + 1, head_host)
+            return
+
+    if pid == 0:
+        tl.store(spilled_to_host_ptr, spill)
+        if spill == 1:
+            tl.store(host_spill_global_ptr, 1)
+
+    # Copy loop: strided over tokens
+    token_idx = pid
+    while token_idx < num_tokens:
+        page_slot = token_idx // PAGE_SIZE
+        token_in_page = token_idx % PAGE_SIZE
+        free_list_idx = (head + page_slot) % cap
+        page_id = tl.load(free_list_ptr + free_list_idx)
+        if token_in_page == 0:
+            tl.store(page_record_ptr + page_slot, page_id)
+        dst_token_idx = page_id * PAGE_SIZE + token_in_page
+
+        elements_per_thread = HIDDEN_SIZE // BLOCK_SIZE
+        need_mask = (HIDDEN_SIZE % BLOCK_SIZE) != 0
+        num_iters = elements_per_thread + (1 if need_mask else 0)
+        token_idx_i64 = token_idx.to(tl.int64)
+        dst_token_idx_i64 = dst_token_idx.to(tl.int64)
+        src_base = src_ptr + token_idx_i64 * HIDDEN_SIZE
+        dst_base = dst_ptr + dst_token_idx_i64 * HIDDEN_SIZE
+
+        if need_mask:
+            for iter in range(num_iters):
+                hidden_offsets = tl.arange(0, BLOCK_SIZE) + iter * BLOCK_SIZE
+                hidden_mask = hidden_offsets < HIDDEN_SIZE
+                data = tl.load(src_base + hidden_offsets, mask=hidden_mask, other=0)
+                tl.store(dst_base + hidden_offsets, data, mask=hidden_mask)
+        else:
+            for iter in range(elements_per_thread):
+                hidden_offsets = tl.arange(0, BLOCK_SIZE) + iter * BLOCK_SIZE
+                data = tl.load(src_base + hidden_offsets)
+                tl.store(dst_base + hidden_offsets, data)
+        token_idx += num_blocks
+
+    if pid == 0:
+        tl.store(new_free_list_head_ptr, new_head_cuda)
+        tl.store(new_free_list_head_ptr + 1, new_head_host)
+
+
+@triton.jit
+def _paged_stash_pop_kernel(
+    cuda_src_ptr,
+    host_src_ptr,
+    dst_ptr,
+    num_tokens_ptr,
+    page_record_ptr,
+    spilled_to_host_ptr,  # 0 = read from CUDA, 1 = read from host
+    overflow_ptr,
+    free_list_cuda_ptr,
+    free_list_host_ptr,
+    free_list_tail_ptr,  # shape (2,)
+    free_list_capacity_ptr,
+    new_free_list_tail_ptr,  # Output: shape (2,) updated tails
+    PAGE_SIZE: tl.constexpr,
+    HIDDEN_SIZE: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    """Reload tokens from paged stash; CUDA path fast, host path when spilled_to_host."""
+    pid = tl.program_id(axis=0)
+    num_blocks = tl.num_programs(axis=0)
+
+    # Load overflow first (get in flight early); branch on it only before any write
+    overflow = tl.load(overflow_ptr)
+
+    num_tokens = tl.load(num_tokens_ptr)
+    spill = tl.load(spilled_to_host_ptr)
+    required_pages = tl.cdiv(num_tokens, PAGE_SIZE)
+
+    # Common case: load only CUDA state (and tail_host for output when spill=0)
+    tail_cuda = tl.load(free_list_tail_ptr)
+    tail_host = tl.load(free_list_tail_ptr + 1)
+    cap_cuda = tl.load(free_list_capacity_ptr)
+
+    if overflow == 1:
+        # No pop; preserve tails so Python copy_ does not write garbage into the buffer.
+        if pid == 0:
+            tl.store(new_free_list_tail_ptr, tail_cuda)
+            tl.store(new_free_list_tail_ptr + 1, tail_host)
+        return
+
+    # Assume CUDA path
+    src_ptr = cuda_src_ptr
+    free_list_ptr = free_list_cuda_ptr
+    tail = tail_cuda
+    cap = cap_cuda
+    new_tail_cuda = tail_cuda + required_pages
+    new_tail_host = tail_host
+
+    # Only when spilled to host: load host state and switch
+    if spill == 1:
+        cap_host = tl.load(free_list_capacity_ptr + 1)
+        if cap_host == 0:
+            # Cannot pop from host; preserve tails (no-op for free-list state).
+            if pid == 0:
+                tl.store(new_free_list_tail_ptr, tail_cuda)
+                tl.store(new_free_list_tail_ptr + 1, tail_host)
+            return
+        src_ptr = host_src_ptr
+        free_list_ptr = free_list_host_ptr
+        tail = tail_host
+        cap = cap_host
+        new_tail_cuda = tail_cuda
+        new_tail_host = tail_host + required_pages
+
+    token_idx = pid
+    while token_idx < num_tokens:
+        page_slot = token_idx // PAGE_SIZE
+        token_in_page = token_idx % PAGE_SIZE
+        page_id = tl.load(page_record_ptr + page_slot)
+        src_token_idx = page_id * PAGE_SIZE + token_in_page
+
+        elements_per_thread = HIDDEN_SIZE // BLOCK_SIZE
+        need_mask = (HIDDEN_SIZE % BLOCK_SIZE) != 0
+        num_iters = elements_per_thread + (1 if need_mask else 0)
+        src_token_idx_i64 = src_token_idx.to(tl.int64)
+        token_idx_i64 = token_idx.to(tl.int64)
+        src_base = src_ptr + src_token_idx_i64 * HIDDEN_SIZE
+        dst_base = dst_ptr + token_idx_i64 * HIDDEN_SIZE
+
+        if need_mask:
+            for iter in range(num_iters):
+                hidden_offsets = tl.arange(0, BLOCK_SIZE) + iter * BLOCK_SIZE
+                hidden_mask = hidden_offsets < HIDDEN_SIZE
+                data = tl.load(src_base + hidden_offsets, mask=hidden_mask, other=0)
+                tl.store(dst_base + hidden_offsets, data, mask=hidden_mask)
+        else:
+            for iter in range(elements_per_thread):
+                hidden_offsets = tl.arange(0, BLOCK_SIZE) + iter * BLOCK_SIZE
+                data = tl.load(src_base + hidden_offsets)
+                tl.store(dst_base + hidden_offsets, data)
+
+        if token_in_page == 0:
+            write_idx = (tail + page_slot) % cap
+            tl.store(free_list_ptr + write_idx, page_id)
+        token_idx += num_blocks
+
+    if pid == 0:
+        tl.store(new_free_list_tail_ptr, new_tail_cuda)
+        tl.store(new_free_list_tail_ptr + 1, new_tail_host)
+
+
+class PagedTensor:
+    """
+    A paged tensor that stores data in pages within a paged stash buffer.
+    """
+
+    def __init__(
+        self,
+        tensor,
+        num_tokens_tensor=None,
+        avg_num_tokens: int = None,
+        vp_stage=None,
+        original_shape=None,
+        schedule_layer_no=None,
+        is_columnwise_scale_inv=None,
+        max_num_tokens=None,
+        hidden_size=None,
+        page_size=64,
+    ):
+        """
+        Args:
+            tensor: The tensor to store
+            num_tokens_tensor: Scalar tensor containing actual number of tokens
+            vp_stage: Virtual pipeline stage
+            layer_name: Name of the layer
+            max_num_tokens: Maximum number of tokens
+            hidden_size: Hidden size
+            page_size: Number of tokens per page
+        """
+        self._tensor = tensor
+        self._original_tensor = None
+        assert (
+            num_tokens_tensor is not None
+            and isinstance(num_tokens_tensor, torch.Tensor)
+            and num_tokens_tensor.numel() == 1
+        )
+        self.num_tokens_tensor = num_tokens_tensor.clone()
+        self.avg_num_tokens = avg_num_tokens
+        self.vp_stage = vp_stage
+        self.schedule_layer_no = schedule_layer_no
+        self.is_columnwise_scale_inv = is_columnwise_scale_inv
+        self.max_num_tokens = max_num_tokens
+        self.hidden_size = hidden_size
+        self.page_size = page_size
+
+        # Original tensor information
+        self.original_shape = list(tensor.shape) if original_shape is None else original_shape
+        self.element_size = tensor.element_size()
+        self.dtype = tensor.dtype
+        self.device = tensor.device
+
+        # Calculate number of pages needed
+        self.max_num_pages = (self.max_num_tokens + page_size - 1) // page_size  # Ceiling division
+
+        # Page record: stores which pages are being used for this tensor
+        self.page_record = torch.zeros(self.max_num_pages, dtype=torch.int64, device=self.device)
+        # Set by copy kernel: 0 = data in CUDA stash, 1 = data in host (pinned) stash
+        self.spilled_to_host = torch.zeros(1, dtype=torch.int64, device=self.device)
+
+    @property
+    def schedule_layer(self):
+        """Get the schedule layer."""
+        return self.schedule_layer_no
+
+    def offload_to_stash(self, paged_stash_buffer: PagedStashBuffer, max_blocks=2048):
+        """Offload the paged tensor to paged stash buffer (CUDA or host if CUDA full)."""
+        self._tensor = self._tensor.contiguous()
+        if self.num_tokens_tensor.dim() == 0:
+            self.num_tokens_tensor = self.num_tokens_tensor.reshape(1)
+        if self.is_columnwise_scale_inv:
+            num_tokens_tensor = self.num_tokens_tensor // SCALE_INV_BLOCK_SIZE
+            max_num_tokens = self.max_num_tokens // SCALE_INV_BLOCK_SIZE
+        else:
+            num_tokens_tensor = self.num_tokens_tensor
+            max_num_tokens = self.max_num_tokens
+
+        tensor_to_copy = self._tensor
+        BLOCK_SIZE = GLOBAL_BLOCK_SIZE
+        num_blocks = min(max_num_tokens, max_blocks)
+        grid = (num_blocks,)
+
+        new_free_list_head = torch.empty(2, dtype=torch.int64, device=self.device)
+        has_host = 1 if paged_stash_buffer.host_buffer is not None else 0
+        host_dst = (
+            paged_stash_buffer.host_buffer
+            if paged_stash_buffer.host_buffer is not None
+            else paged_stash_buffer.cuda_buffer
+        )
+
+        _paged_stash_copy_kernel[grid](
+            tensor_to_copy.view(paged_stash_buffer.cuda_buffer.dtype),
+            paged_stash_buffer.cuda_buffer,
+            host_dst,
+            num_tokens_tensor,
+            paged_stash_buffer.free_list_cuda,
+            paged_stash_buffer.free_list_host,
+            paged_stash_buffer.free_list_head,
+            paged_stash_buffer.free_list_tail,
+            paged_stash_buffer.free_list_capacity,
+            self.page_record,
+            paged_stash_buffer.overflow,
+            paged_stash_buffer.host_spill,
+            self.spilled_to_host,
+            new_free_list_head,
+            PAGE_SIZE=self.page_size,
+            HIDDEN_SIZE=self.hidden_size,
+            BLOCK_SIZE=BLOCK_SIZE,
+            HAS_HOST_BUFFER=has_host,
+        )
+        paged_stash_buffer.free_list_head.copy_(new_free_list_head)
+        self._original_tensor = self._tensor
+        self._tensor = None
+
+    def reload_from_stash(self, paged_stash_buffer: PagedStashBuffer, max_blocks=2048):
+        """Reload the paged tensor from paged stash buffer (CUDA or host from spilled_to_host)."""
+        self._tensor = torch.empty(self.original_shape, dtype=self.dtype, device=self.device)
+        tensor_to_reload = self._tensor
+
+        if self.is_columnwise_scale_inv:
+            num_tokens_tensor = self.num_tokens_tensor // SCALE_INV_BLOCK_SIZE
+            max_num_tokens = self.max_num_tokens // SCALE_INV_BLOCK_SIZE
+        else:
+            num_tokens_tensor = self.num_tokens_tensor
+            max_num_tokens = self.max_num_tokens
+        BLOCK_SIZE = GLOBAL_BLOCK_SIZE
+        num_blocks = min(max_num_tokens, max_blocks)
+        grid = (num_blocks,)
+
+        new_free_list_tail = torch.empty(2, dtype=torch.int64, device=self.device)
+        host_src = (
+            paged_stash_buffer.host_buffer
+            if paged_stash_buffer.host_buffer is not None
+            else paged_stash_buffer.cuda_buffer
+        )
+        _paged_stash_pop_kernel[grid](
+            paged_stash_buffer.cuda_buffer,
+            host_src,
+            tensor_to_reload.view(paged_stash_buffer.cuda_buffer.dtype),
+            num_tokens_tensor,
+            self.page_record,
+            self.spilled_to_host,
+            paged_stash_buffer.overflow,
+            paged_stash_buffer.free_list_cuda,
+            paged_stash_buffer.free_list_host,
+            paged_stash_buffer.free_list_tail,
+            paged_stash_buffer.free_list_capacity,
+            new_free_list_tail,
+            PAGE_SIZE=self.page_size,
+            HIDDEN_SIZE=self.hidden_size,
+            BLOCK_SIZE=BLOCK_SIZE,
+        )
+
+        paged_stash_buffer.free_list_tail.copy_(new_free_list_tail)
+
+
+class PipelinePreScheduleFunction(torch.autograd.Function):
+    """
+    This function is used to update the pp schedule.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor, stash_manager):  # after forward
+        # pylint: disable=missing-function-docstring
+        ctx.stash_manager = stash_manager
+        # Wait for stash to complete before starting the next layer
+        stash_manager.wait_for_stash_to_complete()
+        return tensor
+
+    @staticmethod
+    def backward(ctx, *grad_output):  # before backward
+        # pylint: disable=missing-function-docstring
+        # Initiate reload for next layer
+        if (
+            ctx.stash_manager.status == 'captured'
+            and ctx.stash_manager.current_schedule_index < len(ctx.stash_manager._pp_schedule)
+        ):
+            next_schedule_layer = ctx.stash_manager._pp_schedule[
+                ctx.stash_manager.current_schedule_index
+            ]
+            if next_schedule_layer < 0:
+                ctx.stash_manager.reload_paged_tensors(-next_schedule_layer)
+
+        return grad_output + (None, None)
+
+
+class PipelinePostScheduleFunction(torch.autograd.Function):
+    """
+    This function is used to update the pp schedule.
+    """
+
+    @staticmethod
+    def forward(ctx, tensor, stash_manager):  # after forward
+        # pylint: disable=missing-function-docstring
+        ctx.stash_manager = stash_manager
+        ctx.vp_stage = stash_manager.current_vp_stage
+        if ctx.vp_stage is None:
+            ctx.vp_stage = 0
+        ctx.layer_no, ctx.microbatch_no = stash_manager.update_pp_schedule(ctx.vp_stage + 1)
+
+        # Initiate stash for current layer and reload for next layer
+        if stash_manager.status == 'captured':
+            current_schedule_layer = stash_manager.get_schedule_layer(
+                ctx.vp_stage + 1, ctx.layer_no, ctx.microbatch_no
+            )
+            next_schedule_layer = ctx.stash_manager._pp_schedule[
+                ctx.stash_manager.current_schedule_index + 1
+            ]
+            if current_schedule_layer != -next_schedule_layer:
+                # Start stash for current layer
+                ctx.stash_manager.stash_paged_tensors(current_schedule_layer)
+                if next_schedule_layer < 0:
+                    # reload for next backward layer
+                    ctx.stash_manager.reload_paged_tensors(-next_schedule_layer, no_wait=True)
+            else:
+                ctx.stash_manager.remove_paged_tensor_from_stash()
+
+        ctx.stash_manager.current_schedule_index += 1
+        # return the identical tensor
+        return tensor
+
+    @staticmethod
+    def backward(ctx, *grad_output):  # before backward
+        # pylint: disable=missing-function-docstring
+        if ctx.vp_stage is not None:
+            ctx.stash_manager.update_pp_schedule(
+                -(ctx.vp_stage + 1), -ctx.layer_no, -ctx.microbatch_no
+            )
+        ctx.stash_manager.current_schedule_index += 1
+        current_stream = torch.cuda.current_stream()
+
+        ctx.stash_manager.wait_for_stash_to_complete()
+        if ctx.stash_manager._unpack_stream_status == 'reloading':
+            current_stream.wait_stream(ctx.stash_manager.unpack_stream)
+            ctx.stash_manager._unpack_stream_status = 'idle'
+
+        return grad_output + (None, None)
+
+
+class PagedStashManager:
+    """
+    Singleton manager for coordinating paged stashing across pipeline stages.
+    Manages chunk handlers, synchronizes GPU-GPU transfers,
+    and handles virtual pipeline parallelism
+    """
+
+    STASH_MGR = None
+
+    @classmethod
+    def get_instance(cls):
+        """Get the singleton instance of PagedStashManager."""
+        if cls.STASH_MGR is None:
+            cls.STASH_MGR = PagedStashManager()
+        return cls.STASH_MGR
+
+    def __init__(self):
+        """Initialize the manager with queues and dedicated CUDA streams."""
+        # allocate streams and events for synchronization
+        self.enabled = False
+        self._pack_stream = torch.cuda.Stream()
+        # Currently paged stashing is not stream-safe, so use the same stream for packing
+        # and unpacking
+        self._unpack_stream = self._pack_stream
+        self._pack_stream_status = 'idle'  # idle, stashing
+        self._unpack_stream_status = 'idle'  # idle, reloading
+        self.paged_tensors_to_stash = []
+        self.paged_tensors_stash_in_progress = []
+        self.paged_tensors_to_reload = {}
+
+        self.iteration = 0
+        self._current_layer_name = None
+        self.vp_size = None
+        self.current_vp_stage = None
+        self.status = 'begin'  # begin, capture, captured
+        # If element is +ve, it denotes forward pass of vp stage,
+        # if -ve, it denotes backward pass of vp stage
+        self._pp_schedule = None
+        self.current_layer = None
+        self.current_microbatch = None
+        self.current_schedule_index = None
+
+        # Track max tokens needed across all vp_stages grouped by dtype and hidden_size
+        self.max_tokens_across_vp_stages = None
+        self.temp_tokens_across_vp_stages = None
+        # Track max tokens computed from avg_num_tokens (heuristic) across all vp_stages
+        self.max_avg_tokens_across_vp_stages = None
+        self.temp_avg_tokens_across_vp_stages = None
+
+        self.num_tokens_tensor = None
+        self.max_num_tokens = None
+        # Optional hint: expected/average number of tokens (e.g., pre-padding estimate)
+        self.avg_num_tokens = None
+        self.stash_buffers = None
+        self.overflow = None
+        self.host_spill = None
+        self.device = None
+
+        # Page size for paged memory (default; overwritten from config in paged_stash_reset)
+        self.page_size = 64
+
+    @property
+    def pack_stream(self):
+        """Get the pack stream."""
+        return self._pack_stream
+
+    @property
+    def unpack_stream(self):
+        """Get the unpack stream."""
+        return self._unpack_stream
+
+    def set_current_layer_name(self, name):
+        """Set the current layer name."""
+        self._current_layer_name = name
+
+    def get_schedule_layer(self, vp_stage, layer_no, microbatch_no):
+        """Get the schedule layer."""
+        assert layer_no < 1000 and microbatch_no < 1000, "Schedule encoding overflow"
+        return vp_stage * 1000000 + layer_no * 1000 + microbatch_no
+
+    def add_paged_tensor_to_stash(self, paged_tensor):
+        """Add a paged tensor to the stash list."""
+        if self.status == 'captured':
+            self.paged_tensors_to_stash.append(paged_tensor)
+        else:
+            pass
+
+    def remove_paged_tensor_from_stash(self):
+        """Remove all paged tensors from the stash list."""
+        if self.status == 'captured':
+            self.paged_tensors_to_stash.clear()
+        else:
+            pass
+
+    def stash_paged_tensors(self, pp_schedule_layer):
+        """Stash the paged tensors."""
+        current_stream = torch.cuda.current_stream()
+        self.pack_stream.wait_stream(current_stream)
+
+        with torch.cuda.stream(self.pack_stream):
+            if self.status == 'captured':
+                self._pack_stream_status = 'stashing'
+                if pp_schedule_layer not in self.paged_tensors_to_reload:
+                    self.paged_tensors_to_reload[pp_schedule_layer] = []
+                assert len(self.paged_tensors_to_reload[pp_schedule_layer]) == 0, (
+                    f"paged_tensors_to_reload {pp_schedule_layer} is not empty "
+                    f"{self.paged_tensors_to_reload[pp_schedule_layer]}"
+                )
+                while len(self.paged_tensors_to_stash) > 0:
+                    paged_tensor = self.paged_tensors_to_stash.pop(0)
+                    stash_buffer = self.stash_buffers[paged_tensor.dtype][paged_tensor.hidden_size]
+                    paged_tensor.offload_to_stash(stash_buffer)
+                    self.paged_tensors_to_reload[pp_schedule_layer].append(paged_tensor)
+                    self.paged_tensors_stash_in_progress.append(paged_tensor)
+            else:
+                pass
+        assert (
+            len(self.paged_tensors_to_stash) == 0
+        ), f"paged_tensors_to_stash is not empty {self.paged_tensors_to_stash}"
+
+    def wait_for_stash_to_complete(self):
+        """Wait for stash to complete."""
+        current_stream = torch.cuda.current_stream()
+        if self._pack_stream_status == 'stashing':
+            current_stream.wait_stream(self.pack_stream)
+            self._pack_stream_status = 'idle'
+
+            # Deallocate original tensor after stash is complete
+            while len(self.paged_tensors_stash_in_progress) > 0:
+                paged_tensor = self.paged_tensors_stash_in_progress.pop(0)
+                paged_tensor._original_tensor = None
+
+    def reload_paged_tensors(self, pp_schedule_layer, no_wait=False):
+        """Reload the paged tensors."""
+        # Avoid waiting for main stream if reload is immediately after stash
+        # since stash is already waiting for main stream
+        if not no_wait or self.unpack_stream != self.pack_stream:
+            current_stream = torch.cuda.current_stream()
+            self.unpack_stream.wait_stream(current_stream)
+
+        with torch.cuda.stream(self.unpack_stream):
+            if self.status == 'captured':
+                self._unpack_stream_status = 'reloading'
+                while len(self.paged_tensors_to_reload[pp_schedule_layer]) > 0:
+                    paged_tensor = self.paged_tensors_to_reload[pp_schedule_layer].pop(0)
+                    stash_buffer = self.stash_buffers[paged_tensor.dtype][paged_tensor.hidden_size]
+                    paged_tensor.reload_from_stash(stash_buffer)
+            else:
+                pass
+            assert len(self.paged_tensors_to_reload[pp_schedule_layer]) == 0, (
+                f"paged_tensors_to_reload {pp_schedule_layer} is not empty "
+                f"{self.paged_tensors_to_reload[pp_schedule_layer]}"
+            )
+
+    def allocate_stash_buffers(
+        self,
+        moe_paged_stash_buffer_size_factor_cuda: float = 1.10,
+        moe_paged_stash_buffer_size_factor_cpu: float = 0.0,
+    ):
+        """Allocate stash buffers organized by [dtype][hidden_size]."""
+        self.stash_buffers = {}
+        if self.overflow is None:
+            self.overflow = torch.zeros(1, dtype=torch.int64, device=self.device)
+        else:
+            self.overflow.zero_()
+        if self.host_spill is None:
+            self.host_spill = torch.zeros(1, dtype=torch.int64, device=self.device)
+        else:
+            self.host_spill.zero_()
+
+        cuda_factor = moe_paged_stash_buffer_size_factor_cuda
+        cpu_factor = moe_paged_stash_buffer_size_factor_cpu
+
+        # Both factors use the same sign convention:
+        # - positive: size based on avg_num_tokens-derived maxima
+        # - negative: size based on actual num_tokens-derived maxima (legacy behavior)
+        # Scale is always abs(factor). For CPU, 0 means no host buffer.
+        if cuda_factor >= 0:
+            max_tokens_dict = self.max_avg_tokens_across_vp_stages
+            cuda_scale = cuda_factor
+        else:
+            max_tokens_dict = self.max_tokens_across_vp_stages
+            cuda_scale = -cuda_factor
+
+        # Fallback safety: if avg-based dict is not available/populated yet, use actual-max dict.
+        if not max_tokens_dict:
+            max_tokens_dict = self.max_tokens_across_vp_stages
+
+        if cpu_factor > 0:
+            host_tokens_dict = (
+                self.max_avg_tokens_across_vp_stages or self.max_tokens_across_vp_stages
+            )
+            cpu_scale = cpu_factor
+        elif cpu_factor < 0:
+            host_tokens_dict = self.max_tokens_across_vp_stages
+            cpu_scale = -cpu_factor
+        else:
+            host_tokens_dict = None
+            cpu_scale = 0.0
+
+        if max_tokens_dict is None:
+            log_single_rank(
+                logger,
+                logging.INFO,
+                "Paged stash: max_tokens_dict is None, skipping stash buffer allocation",
+            )
+            return
+        for dtype, hidden_size in max_tokens_dict:
+            if dtype not in self.stash_buffers:
+                self.stash_buffers[dtype] = {}
+            assert hidden_size not in self.stash_buffers[dtype]
+            num_tokens = int(max_tokens_dict[dtype, hidden_size] * cuda_scale)
+            num_tokens_host = (
+                int(host_tokens_dict[dtype, hidden_size] * cpu_scale)
+                if host_tokens_dict is not None and (dtype, hidden_size) in host_tokens_dict
+                else 0
+            )
+            buf_dtype = (
+                torch.uint8 if dtype in [torch.float8_e4m3fn, torch.float8_e8m0fnu] else dtype
+            )
+            self.stash_buffers[dtype][hidden_size] = PagedStashBuffer(
+                num_tokens,
+                hidden_size,
+                self.page_size,
+                self.device,
+                self.overflow,
+                self.host_spill,
+                buf_dtype,
+                num_tokens_host=num_tokens_host,
+            )
+            sb = self.stash_buffers[dtype][hidden_size]
+            msg = f'allocate_stash_buffers cuda: {sb.cuda_buffer.shape}'
+            if sb.host_buffer is not None:
+                msg += f' host: {sb.host_buffer.shape}'
+            msg += f' dtype={sb.dtype} ({dtype})'
+            log_single_rank(logger, logging.INFO, msg)
+
+    def release_stash_buffers(self):
+        """Drop large stash CUDA/host page buffers after full-iteration CUDA graph teardown
+        (fallback).
+
+        Shared ``overflow`` / ``host_spill`` scalars are retained (small). Reallocation of page
+        buffers happens on the next ``paged_stash_reset`` while status remains ``captured``.
+        """
+        if torch.cuda.is_available():
+            torch.cuda.synchronize()
+        self.stash_buffers = None
+        log_single_rank(
+            logger,
+            logging.INFO,
+            (
+                "Paged stash: released stash page buffers after fallback "
+                "(reallocated on next stash reset)."
+            ),
+        )
+
+    def update_pp_schedule(self, vp_stage, layer_no=None, microbatch_no=None):
+        """Update the pp schedule."""
+        if self._pp_schedule is None:
+            self._pp_schedule = []
+
+        assert self.vp_size is not None
+        if layer_no is None:
+            # forward pass
+            vp_stage_index = vp_stage - 1
+            layer_no = self.current_layer[vp_stage_index]
+            self.current_layer[vp_stage_index] += 1
+            microbatch_no = self.current_microbatch[vp_stage_index]
+
+        if self.status == 'capture':
+            self._pp_schedule.append(self.get_schedule_layer(vp_stage, layer_no, microbatch_no))
+
+        expected = self.get_schedule_layer(vp_stage, layer_no, microbatch_no)
+        actual = self._pp_schedule[self.current_schedule_index]
+        assert actual == expected, f"schedule {actual} != {expected}"
+
+        return layer_no, microbatch_no
+
+    def update_model_chunk(self, vp_stage_index):
+        """Update layer=1, increment microbatch of new vp vp_stage."""
+        if self.current_layer is None:
+            # current layer and microbatch for each vp stage for forward pass
+            self.current_layer = [1 for _ in range(self.vp_size)]
+            self.current_microbatch = [0 for _ in range(self.vp_size)]
+        self.current_layer[vp_stage_index] = 1
+        self.current_microbatch[vp_stage_index] += 1
+
+    def on_save_for_backward(self, tensor: torch.Tensor) -> Any:
+        """
+        Hook called when autograd saves a tensor for backward pass.
+        Returns a tag to identify the tensor later.
+        """
+        # Handle 0-dim tensors (torch.Size([])) - they have no size(0)
+        if (
+            self.max_num_tokens is None
+            or tensor.dim() == 0
+            or not hasattr(tensor, 'grouped_tensor_scale_inv')
+        ):
+            return tensor.detach()
+
+        assert isinstance(tensor, torch.Tensor), f"tensor is not a torch.Tensor {type(tensor)}"
+
+        original_shape = tensor.shape
+        columnwise_scale_inv = tensor.grouped_tensor_scale_inv
+        tensor = tensor.flatten()
+        dtype = tensor.dtype
+        hidden_size = tensor.numel() // (
+            self.max_num_tokens
+            if not columnwise_scale_inv
+            else self.max_num_tokens // SCALE_INV_BLOCK_SIZE
+        )
+
+        if self.max_tokens_across_vp_stages is None:
+            self.max_tokens_across_vp_stages = {}
+            self.temp_tokens_across_vp_stages = {}
+            self.max_avg_tokens_across_vp_stages = {}
+            self.temp_avg_tokens_across_vp_stages = {}
+
+        avg_num_tokens = None
+        if self.status == 'capture':
+
+            self.num_tokens = self.num_tokens_tensor.item()
+            actual_num_tokens = (
+                self.num_tokens // SCALE_INV_BLOCK_SIZE if columnwise_scale_inv else self.num_tokens
+            )
+
+            avg_num_tokens = int(self.avg_num_tokens) if self.avg_num_tokens is not None else None
+
+            if (dtype, hidden_size) not in self.temp_tokens_across_vp_stages:
+                self.temp_tokens_across_vp_stages[dtype, hidden_size] = 0
+                self.max_tokens_across_vp_stages[dtype, hidden_size] = 0
+                self.temp_avg_tokens_across_vp_stages[dtype, hidden_size] = 0
+                self.max_avg_tokens_across_vp_stages[dtype, hidden_size] = 0
+
+            self.temp_tokens_across_vp_stages[dtype, hidden_size] += actual_num_tokens
+            self.max_tokens_across_vp_stages[dtype, hidden_size] = max(
+                self.max_tokens_across_vp_stages[dtype, hidden_size],
+                self.temp_tokens_across_vp_stages[dtype, hidden_size],
+            )
+
+            # Track avg tokens across vp stages (if provided) using the same accumulation model.
+            if avg_num_tokens is not None:
+                self.temp_avg_tokens_across_vp_stages[dtype, hidden_size] += (
+                    avg_num_tokens
+                    if not columnwise_scale_inv
+                    else avg_num_tokens // SCALE_INV_BLOCK_SIZE
+                )
+                self.max_avg_tokens_across_vp_stages[dtype, hidden_size] = max(
+                    self.max_avg_tokens_across_vp_stages[dtype, hidden_size],
+                    self.temp_avg_tokens_across_vp_stages[dtype, hidden_size],
+                )
+
+            # Since capture stage does not use CUDA graph, we can truncate
+            # the saved tensor to actual num_tokens
+            new_size = (actual_num_tokens * hidden_size,)
+
+            tensor_truncated = torch.empty(new_size, dtype=dtype, device=tensor.device)
+            tensor_truncated.copy_(tensor[: actual_num_tokens * hidden_size])
+            tensor = tensor_truncated
+
+        tensor.grouped_tensor_scale_inv = columnwise_scale_inv
+        paged_tensor = PagedTensor(
+            tensor,
+            num_tokens_tensor=self.num_tokens_tensor,
+            avg_num_tokens=avg_num_tokens,
+            vp_stage=self.current_vp_stage,
+            original_shape=original_shape,
+            schedule_layer_no=(
+                self._pp_schedule[self.current_schedule_index]
+                if self._pp_schedule is not None
+                and self.current_schedule_index < len(self._pp_schedule)
+                else None
+            ),
+            is_columnwise_scale_inv=columnwise_scale_inv,
+            max_num_tokens=self.max_num_tokens,
+            hidden_size=hidden_size,
+            page_size=self.page_size,
+        )
+
+        if self.status == 'captured':
+            self.add_paged_tensor_to_stash(paged_tensor)
+        return paged_tensor
+
+    def on_get_saved_tensor(self, saved_state: Any) -> torch.Tensor:
+        """
+        Hook called when autograd retrieves a saved tensor during backward pass.
+        Returns the actual tensor (potentially reloading from CPU).
+        """
+        if isinstance(saved_state, (PagedTensor)):
+            columnwise_scale_inv = saved_state.is_columnwise_scale_inv
+            if self.status == 'capture':
+                num_tokens = saved_state.num_tokens_tensor.item()
+                key = (saved_state.dtype, saved_state.hidden_size)
+                if key in self.temp_tokens_across_vp_stages:
+                    self.temp_tokens_across_vp_stages[key] -= (
+                        num_tokens
+                        if not columnwise_scale_inv
+                        else num_tokens // SCALE_INV_BLOCK_SIZE
+                    )
+                if (
+                    saved_state.avg_num_tokens is not None
+                    and key in self.temp_avg_tokens_across_vp_stages
+                ):
+                    self.temp_avg_tokens_across_vp_stages[key] -= (
+                        int(saved_state.avg_num_tokens)
+                        if not columnwise_scale_inv
+                        else int(saved_state.avg_num_tokens) // SCALE_INV_BLOCK_SIZE
+                    )
+
+                # Handle 1-byte tensors (torch.uint8)
+                dtype = saved_state._tensor.dtype
+                if saved_state._tensor.element_size() == 1:
+                    saved_state._tensor = saved_state._tensor.view(torch.uint8)
+
+                # Pad the tensor to the max number of tokens
+                # check if the tensor is 1D
+                assert (
+                    saved_state._tensor.ndim == 1
+                ), f"saved_state._tensor.ndim is not 1 {saved_state._tensor.ndim}"
+                npad = (self.max_num_tokens - num_tokens) * saved_state.hidden_size
+                if columnwise_scale_inv:
+                    npad = npad // SCALE_INV_BLOCK_SIZE
+                pad = (0, npad)
+                saved_state._tensor = torch.nn.functional.pad(saved_state._tensor, pad).view(dtype)
+
+            assert (
+                saved_state._tensor is not None
+            ), f"saved_state._tensor is None {saved_state._tensor}"
+
+            # Record cross-stream usage (important when tensor was produced on another stream).
+            if isinstance(saved_state._tensor, torch.Tensor) and saved_state._tensor.is_cuda:
+                saved_state._tensor.record_stream(torch.cuda.current_stream())
+
+            return saved_state._tensor.view(saved_state.original_shape)
+
+        return saved_state
+
+
+class PagedStashContext:
+    """Wrapper context manager that adds custom enter/exit behavior around saved_tensors_hooks."""
+
+    def __init__(self, stash_manager):
+        self.stash_manager = stash_manager
+        self.saved_tensors_context = torch.autograd.graph.saved_tensors_hooks(
+            stash_manager.on_save_for_backward, stash_manager.on_get_saved_tensor
+        )
+
+    def __enter__(self):
+        result = self.saved_tensors_context.__enter__()
+
+        # Add more custom logic after entering if needed
+        return result
+
+    def __exit__(self, *args: Any):
+        # Call the underlying context manager's __exit__
+        result = self.saved_tensors_context.__exit__(*args)
+        return result
+
+
+def paged_stash_group_start(tensor):
+    """Mark the start of a layer group and prepare for stash/reload."""
+    stash_manager = PagedStashManager.get_instance()
+    if not stash_manager.enabled:
+        return tensor
+    return PipelinePreScheduleFunction.apply(tensor, stash_manager)
+
+
+def get_paged_stash_context(
+    name=None, max_num_tokens=None, num_tokens_tensor=None, avg_num_tokens=None
+):
+    """Get the paged stash context"""
+    stash_manager = PagedStashManager.get_instance()
+    if not stash_manager.enabled:
+        return nullcontext()
+    stash_manager.max_num_tokens = max_num_tokens
+    stash_manager.avg_num_tokens = avg_num_tokens
+    assert num_tokens_tensor is not None and isinstance(num_tokens_tensor, torch.Tensor)
+    stash_manager.num_tokens_tensor = num_tokens_tensor
+    stash_manager.set_current_layer_name(name) if name is not None else None
+    pack_unpack_context = PagedStashContext(stash_manager)
+    return pack_unpack_context
+
+
+def paged_stash_group_commit(tensor, name=None):
+    """Mark the end of a layer group and prepare for stash/reload."""
+    stash_manager = PagedStashManager.get_instance()
+    stash_manager.device = tensor.device
+    if not stash_manager.enabled:
+        return tensor
+    return PipelinePostScheduleFunction.apply(tensor, stash_manager)
+
+
+def paged_stash_init_chunk_handler(vp_size, vp_stage):
+    """Initialize the chunk handler, called at the start of a microbatch forward pass."""
+    stash_manager = PagedStashManager.get_instance()
+    stash_manager.vp_size = vp_size if vp_size is not None else 1
+    stash_manager.current_vp_stage = vp_stage if vp_stage is not None else 0
+    stash_manager.update_model_chunk(stash_manager.current_vp_stage)
+
+
+def paged_stash_reset(enabled=True, config=None):
+    """Reset the chunk handler, called at the start of a training iteration.
+
+    config: optional TransformerConfig; if provided, moe_paged_stash_buffer_size_factor_cuda/cpu and
+    moe_paged_stash_page_size are read from it. Otherwise defaults to 1.10 (CUDA), 0.0 (CPU).
+    """
+    stash_manager = PagedStashManager.get_instance()
+    stash_manager.enabled = enabled
+    stash_manager.iteration += 1
+    if config is not None:
+        stash_manager.page_size = config.moe_paged_stash_page_size
+    # current layer and microbatch for each vp stage for forward pass
+    stash_manager.current_schedule_index = 0
+
+    if not enabled:
+        return
+
+    if stash_manager.status == 'begin':
+        stash_manager.status = 'capture'
+    elif stash_manager.status == 'capture':
+        stash_manager.status = 'captured'
+        cuda_factor = config.moe_paged_stash_buffer_size_factor_cuda if config is not None else 1.10
+        cpu_factor = config.moe_paged_stash_buffer_size_factor_cpu if config is not None else 0.0
+        stash_manager.allocate_stash_buffers(
+            moe_paged_stash_buffer_size_factor_cuda=cuda_factor,
+            moe_paged_stash_buffer_size_factor_cpu=cpu_factor,
+        )
+    elif stash_manager.status == 'captured':
+        # Buffers may have been released after a PagedStashRunner fallback; reallocate using
+        # the same capture-derived maxima and current config factors.
+        if stash_manager.stash_buffers is None:
+            cuda_factor = (
+                config.moe_paged_stash_buffer_size_factor_cuda if config is not None else 1.10
+            )
+            cpu_factor = (
+                config.moe_paged_stash_buffer_size_factor_cpu if config is not None else 0.0
+            )
+            stash_manager.allocate_stash_buffers(
+                moe_paged_stash_buffer_size_factor_cuda=cuda_factor,
+                moe_paged_stash_buffer_size_factor_cpu=cpu_factor,
+            )
+
+    if stash_manager.status == 'captured':
+        assert (
+            stash_manager.stash_buffers is not None
+        ), "Paged stash: captured state but stash_buffers is None after reset/allocation."
+        for dtype in stash_manager.stash_buffers.keys():
+            for hidden_size in stash_manager.stash_buffers[dtype].keys():
+                stash_manager.stash_buffers[dtype][hidden_size].reset()
+        stash_manager.overflow.zero_()
+        stash_manager.host_spill.zero_()
+        stash_manager.current_layer = [1 for _ in range(stash_manager.vp_size)]
+        stash_manager.current_microbatch = [0 for _ in range(stash_manager.vp_size)]
+        assert (
+            len(stash_manager.paged_tensors_to_stash) == 0
+        ), f"paged_tensors_to_stash is not empty {stash_manager.paged_tensors_to_stash}"
+        assert len(stash_manager.paged_tensors_stash_in_progress) == 0, (
+            f"paged_tensors_stash_in_progress is not empty "
+            f"{stash_manager.paged_tensors_stash_in_progress}"
+        )
+
+
+def check_paged_stash_overflow():
+    """Check if paged stash overflow"""
+    stash_manager = PagedStashManager.get_instance()
+    if not stash_manager.enabled or stash_manager.overflow is None:
+        return torch.zeros(1, dtype=torch.bool, device='cuda')
+    overflow = stash_manager.overflow.ne(0)
+    return overflow
+
+
+def check_paged_stash_host_spill():
+    """True if any activation was stashed to pinned host (successful spill, not overflow path)."""
+    stash_manager = PagedStashManager.get_instance()
+    if not stash_manager.enabled or stash_manager.host_spill is None:
+        return torch.zeros(1, dtype=torch.bool, device='cuda')
+    return stash_manager.host_spill.ne(0)
+
+
+class PagedStashRunner:
+    """Runner for paged stash"""
+
+    def __init__(self, config, copy_main_params, model, optimizer, forward_backward_func):
+        self.stash_manager = PagedStashManager.get_instance()
+        self.config = config
+        self.copy_main_params = copy_main_params
+        self.model = model
+        self.optimizer = optimizer
+        self.forward_backward_func = forward_backward_func
+        self.moe_layers = []
+        # TransformerConfig objects that must stay in sync for moe_paged_stash: the training
+        # loop `config` (schedules / paged_stash_reset) plus each VP chunk's GPT root config
+        # (GPTModel.forward). MoE mlps use the same config reference as that root, so we do
+        # not track mlp.config separately.
+        seen_cfg_ids = set()
+        self._configs_to_sync_moe_paged_stash = []
+
+        def _track_cfg(c):
+            if c is None:
+                return
+            cid = id(c)
+            if cid not in seen_cfg_ids:
+                seen_cfg_ids.add(cid)
+                self._configs_to_sync_moe_paged_stash.append(c)
+
+        _track_cfg(config)
+
+        for model_chunk in self.model:
+            model_with_decoder = get_attr_wrapped_model(
+                model_chunk, "decoder", allow_none=False, return_model_obj=True
+            )
+            _track_cfg(model_with_decoder.config)
+            for layer in model_with_decoder.decoder.layers:
+                mlp = layer.mlp
+                if hasattr(mlp, 'token_dispatcher') and hasattr(
+                    mlp.token_dispatcher, 'check_over_budget'
+                ):
+                    self.moe_layers.append(mlp)
+            if model_with_decoder.mtp_process:
+                for layer in model_with_decoder.mtp.layers:
+                    mlp = layer.mtp_model_layer.mlp
+                    if hasattr(mlp, 'token_dispatcher') and hasattr(
+                        mlp.token_dispatcher, 'check_over_budget'
+                    ):
+                        self.moe_layers.append(mlp)
+
+    def _set_moe_paged_stash_all(self, value: bool) -> None:
+        """Set moe_paged_stash on every tracked config (train + per VP chunk root)."""
+        for c in self._configs_to_sync_moe_paged_stash:
+            c.moe_paged_stash = value
+
+    def data_read(self, data_iterator, model, training, num_microbatches):
+        """Read all microbatch inputs from Dataloader and copy to static buffers."""
+        data_iterator_saved = []
+        if not isinstance(model, list) or len(model) == 1:
+            assert not isinstance(data_iterator, list) or len(data_iterator) == 1
+            iterator0 = data_iterator if not isinstance(data_iterator, list) else data_iterator[0]
+            data_list = []
+            if iterator0 is not None:
+                for b in range(num_microbatches):
+                    data_list.append(next(iterator0))
+                data_iterator_saved.append(iter(data_list))
+                data_list = [iter(data_list)]
+            else:
+                data_iterator_saved.append(None)
+                data_list.append(None)
+        else:
+            assert isinstance(data_iterator, list) and len(data_iterator) == len(model)
+            data_list = []
+            for i in range(len(model)):
+                if data_iterator[i] is not None:
+                    data_list_i = []
+                    for b in range(num_microbatches):
+                        data_list_i.append(next(data_iterator[i]))
+                    data_iterator_saved.append(iter(data_list_i))
+                    data_list.append(iter(data_list_i))
+                else:
+                    data_iterator_saved.append(None)
+                    data_list.append(None)
+        return data_iterator_saved, data_list
+
+    def check_moe_overflow(self):
+        """(stash_overflow_rank_sum, overbudget_rank_sum, host_spill_rank_sum); one all_reduce."""
+        stash_overflow = check_paged_stash_overflow().view(-1)[0]
+        host_spill = check_paged_stash_host_spill().view(-1)[0]
+        overbudget = torch.zeros(1, dtype=torch.bool, device=stash_overflow.device).view(-1)[0]
+        for mlp in self.moe_layers:
+            ob = mlp.token_dispatcher.check_over_budget()
+            if ob is not None:
+                overbudget |= ob.view(-1)[0]
+
+        flags = torch.stack(
+            [
+                stash_overflow.to(torch.int32),
+                overbudget.to(torch.int32),
+                host_spill.to(torch.int32),
+            ],
+            dim=0,
+        )
+        torch.distributed.all_reduce(flags, op=torch.distributed.ReduceOp.SUM)
+        return flags[0].item(), flags[1].item(), flags[2].item()
+
+    def prepare_for_rerun(self, is_training=True):
+        """Prepare for rerun"""
+        log_single_rank(
+            logger,
+            logging.INFO,
+            "Paged stash: rerunning forward-backward without "
+            "moe_expert_rank_capacity_factor padding and with moe_paged_stash disabled.",
+        )
+        # check for token dispatcher overflow
+        for mlp in self.moe_layers:
+            if hasattr(mlp, 'token_dispatcher') and hasattr(
+                mlp.token_dispatcher._comm_manager, 'moe_expert_rank_capacity_factor'
+            ):
+                mlp.token_dispatcher._comm_manager.moe_expert_rank_capacity_factor = None
+                mlp.token_dispatcher.reset_over_budget()
+        if self.stash_manager.overflow is not None:
+            self.stash_manager.overflow.zero_()
+        if self.stash_manager.host_spill is not None:
+            self.stash_manager.host_spill.zero_()
+        self._set_moe_paged_stash_all(False)
+
+        # Set grad to zero.
+        for model_chunk in self.model:
+            model_chunk.zero_grad_buffer()
+        if self.optimizer is not None:
+            self.optimizer.zero_grad()
+
+        # _handle_mxfp8_param_buffer_copy
+        if self.copy_main_params:
+
+            def _try_copy_main_params(opt):
+                if isinstance(opt, DistributedOptimizer) and hasattr(
+                    opt, 'shard_fp32_from_float16_groups'
+                ):
+                    opt._copy_main_params_to_param_buffer()
+
+            # Handle both ChainedOptimizer and direct DistributedOptimizer cases
+            # Note: FSDP's DistributedOptimizer doesn't have shard_fp32_from_float16_groups,
+            # so we check for this attribute before calling _copy_main_params_to_param_buffer
+            if self.optimizer is not None:
+                if hasattr(self.optimizer, 'chained_optimizers'):
+                    for optim_instance in self.optimizer.chained_optimizers:
+                        _try_copy_main_params(optim_instance)
+                else:
+                    _try_copy_main_params(self.optimizer)
+
+        # Delete the CUDA graph before releasing stash tensors the captured graph may reference.
+        if isinstance(self.forward_backward_func, FullCudaGraphWrapper):
+            self.forward_backward_func.reset_cuda_graph(
+                stage='training' if is_training else 'validation'
+            )
+
+        # Only drop page buffers on training fallback. Validation uses forward_only=True, so
+        # paged_stash_reset disables the stash manager and eval forward never reads/writes the
+        # large page buffers—freeing them here saves almost nothing. If we released on eval,
+        # the next training step would realloc new buffer addresses while the training
+        # FullCudaGraphWrapper could still replay a graph recorded against the old pointers.
+        # Training fallback resets the training graph before this path, so release + realloc
+        # remains consistent with capture.
+        if is_training:
+            self.stash_manager.release_stash_buffers()
+
+    def __call__(self, *args, **kwargs):
+        """Run the paged stash"""
+        assert len(args) == 0, 'forward_backward_func does not accept positional args'
+        assert all(
+            [
+                kwarg in kwargs
+                for kwarg in [
+                    'model',
+                    'data_iterator',
+                    'num_microbatches',
+                    'seq_length',
+                    'forward_only',
+                ]
+            ]
+        )
+        model = kwargs['model']
+        num_microbatches = kwargs['num_microbatches']
+
+        training = not kwargs['forward_only']
+        data_iterator = kwargs['data_iterator']
+        saved_moe_paged_stash = self.config.moe_paged_stash
+        num_tries = 0
+        while True:
+            assert (
+                num_tries < 2
+            ), f"PagedStashRunner: num_tries {num_tries} exceeded max attempts!!!"
+            num_tries += 1
+            data_iterator, data_list = self.data_read(
+                data_iterator, model, training, num_microbatches
+            )
+
+            kwargs['data_iterator'] = data_list
+            result = self.forward_backward_func(*args, **kwargs)
+
+            stash_overflow_ranks, overbudget_ranks, host_spill_ranks = self.check_moe_overflow()
+            # if no overflow, set the expert_rank_capacity_factor to the original value
+            if stash_overflow_ranks == 0 and overbudget_ranks == 0:
+                if host_spill_ranks > 0:
+                    log_single_rank(
+                        logger,
+                        logging.INFO,
+                        "Paged stash: spilled activations to pinned host "
+                        f"on {host_spill_ranks} rank(s) (CUDA stash full). "
+                        "Consider increasing moe_paged_stash_buffer_size_factor_cuda for "
+                        "potentially better performance.",
+                    )
+                for mlp in self.moe_layers:
+                    if hasattr(mlp, 'token_dispatcher') and hasattr(
+                        mlp.token_dispatcher._comm_manager, 'moe_expert_rank_capacity_factor'
+                    ):
+                        mlp.token_dispatcher._comm_manager.moe_expert_rank_capacity_factor = (
+                            mlp.token_dispatcher.config.moe_expert_rank_capacity_factor
+                        )
+                self._set_moe_paged_stash_all(saved_moe_paged_stash)
+                break
+
+            # if overflow or overbudget, set the expert_rank_capacity_factor to None
+            if overbudget_ranks > 0:
+                log_single_rank(
+                    logger,
+                    logging.INFO,
+                    "Paged stash: token drop during MoE token dispatch (over budget) "
+                    f"on {overbudget_ranks} rank(s). "
+                    "Consider increasing moe_expert_rank_capacity_factor.",
+                )
+            if stash_overflow_ranks > 0:
+                log_single_rank(
+                    logger,
+                    logging.INFO,
+                    "Paged stash: stashing buffer overflow "
+                    f"on {stash_overflow_ranks} rank(s). "
+                    "Consider increasing moe_paged_stash_buffer_size_factor_cuda or "
+                    "moe_paged_stash_buffer_size_factor_cpu.",
+                )
+            self.prepare_for_rerun(is_training=training)
+        return result
diff --git a/megatron/core/transformer/moe/router.py b/megatron/core/transformer/moe/router.py
index a773775a299..cdf968c6b12 100644
--- a/megatron/core/transformer/moe/router.py
+++ b/megatron/core/transformer/moe/router.py
@@ -7,6 +7,7 @@
 
 from megatron.core.jit import jit_fuser
 from megatron.core.transformer.module import MegatronModule
+from megatron.core.transformer.moe.moe_logging import get_moe_metrics_tracker
 from megatron.core.transformer.moe.moe_utils import (
     MoEAuxLossAutoScaler,
     ProcessGroupCollection,
@@ -16,7 +17,6 @@
     compute_routing_scores_for_aux_loss,
     get_tokens_per_expert_and_token_count,
     router_gating_linear,
-    save_to_aux_losses_tracker,
     sinkhorn,
     switch_load_balancing_loss_func,
     topk_routing_with_score_function,
@@ -34,6 +34,7 @@ def __init__(
         config: TransformerConfig,
         pg_collection: Optional[ProcessGroupCollection] = None,
         is_mtp_layer: bool = False,
+        layer_number: Optional[int] = None,
     ) -> None:
         """
         Initialize the Router module.
@@ -47,7 +48,7 @@ def __init__(
         self.config = config
         self.num_experts = self.config.num_moe_experts
         self.moe_aux_loss_func = None
-        self.layer_number = None
+        self.layer_number = layer_number
         self.is_mtp_layer = is_mtp_layer
         self.tp_group = pg_collection.tp
         self.cp_group = pg_collection.cp
@@ -155,6 +156,7 @@ def __init__(
         config: TransformerConfig,
         pg_collection: Optional[ProcessGroupCollection] = None,
         is_mtp_layer: bool = False,
+        layer_number: Optional[int] = None,
     ) -> None:
         """Initialize the zero token dropping router.
 
@@ -163,13 +165,41 @@ def __init__(
             pg_collection (ProcessGroupCollection, optional): Process groups for MoE operations.
             is_mtp_layer (bool): Flag indicating if this router is part of an MTP layer.
         """
-        super().__init__(config=config, pg_collection=pg_collection, is_mtp_layer=is_mtp_layer)
+        super().__init__(
+            config=config,
+            pg_collection=pg_collection,
+            is_mtp_layer=is_mtp_layer,
+            layer_number=layer_number,
+        )
         self.topk = self.config.moe_router_topk
         self.routing_type = self.config.moe_router_load_balancing_type
         self.score_function = self.config.moe_router_score_function
         self.input_jitter = None
 
-        self.enable_expert_bias = self.config.moe_router_enable_expert_bias
+        if self.config.moe_n_hash_layers > 0:
+            assert layer_number is not None, "layer_number is required for the hash-based router."
+        self.is_hash_layer = (
+            not self.is_mtp_layer
+            and self.config.moe_n_hash_layers > 0
+            and layer_number <= self.config.moe_n_hash_layers
+        )
+        if self.is_hash_layer:
+            # DSv4-Pro ships a pre-trained tid2eid table in its inference checkpoint;
+            # no public initialization recipe is documented. Round-robin is used here
+            # only as a placeholder so the layer is runnable from scratch.
+            vocab_size = self.config.actual_vocab_size
+            num_experts = self.config.num_moe_experts
+            ids = torch.arange(vocab_size, device=torch.cuda.current_device())
+            tid2eid = torch.stack([(ids + k) % num_experts for k in range(self.topk)], dim=1).to(
+                torch.int32
+            )
+            self.register_buffer('tid2eid', tid2eid)
+        else:
+            self.tid2eid = None
+
+        self.enable_expert_bias = (
+            self.config.moe_router_enable_expert_bias and not self.is_hash_layer
+        )
         if self.enable_expert_bias:
             self.register_buffer(
                 'local_tokens_per_expert',
@@ -312,6 +342,7 @@ def _apply_aux_loss(
             moe_aux_loss_coeff=aux_loss_coeff,
             fused=self.config.moe_router_fusion,
         )
+
         probs = self.attach_and_log_load_balancing_loss(
             probs,
             aux_loss_coeff,
@@ -398,7 +429,6 @@ def _apply_global_aux_loss(
                 topk=self.topk,
             )
         )
-
         self.global_tokens_per_expert += global_tokens_per_expert
         self.ga_steps += 1
         averated_tokens_per_expert = self.global_tokens_per_expert / self.ga_steps
@@ -412,13 +442,14 @@ def _apply_global_aux_loss(
             moe_aux_loss_coeff=global_aux_loss_coeff,
             fused=self.config.moe_router_fusion,
         )
+
         probs = self.attach_and_log_load_balancing_loss(
             probs,
             global_aux_loss_coeff,
             global_aux_loss,
             "global_load_balancing_loss",
             self.tp_dp_cp_group,
-            reduce_group_has_dp=True,
+            needs_dp_avg=False,
             valid_token_count=local_num_tokens,
         )
         return probs
@@ -430,7 +461,7 @@ def attach_and_log_load_balancing_loss(
         aux_loss: torch.Tensor,
         aux_loss_name: str,
         reduce_group: torch.distributed.ProcessGroup,
-        reduce_group_has_dp: bool = False,
+        needs_dp_avg: bool = True,
         valid_token_count: Optional[Union[int, torch.Tensor]] = None,
     ):
         """Attach aux loss function to activation and add to logging.
@@ -441,9 +472,7 @@ def attach_and_log_load_balancing_loss(
             aux_loss (torch.Tensor): Computed aux loss.
             aux_loss_name (str): Name of the aux loss for logging.
             reduce_group (torch.distributed.ProcessGroup): Process group for reduction.
-            reduce_group_has_dp (bool): Whether the reduce group has data parallel ranks.
-                Set this to True if the reduce group has data parallel ranks. This flag is used to
-                ensure the correct reduction in aux loss tracking.
+            needs_dp_avg (bool): Whether to average this metric across DP ranks after reduce_group.
             valid_token_count (int or torch.Tensor, optional): Number of valid tokens excluding
                 padding tokens. Can be a Python int or a torch.Tensor (typically 0-d tensor).
                 If None, uses activation.shape[0]. Defaults to None.
@@ -471,13 +500,13 @@ def attach_and_log_load_balancing_loss(
         else:
             layer_number = self.layer_number
 
-        save_to_aux_losses_tracker(
+        get_moe_metrics_tracker().record(
             aux_loss_name,
             aux_loss / aux_loss_coeff,
             layer_number,
             num_layers,
             reduce_group=reduce_group,
-            reduce_group_has_dp=reduce_group_has_dp,
+            needs_dp_avg=needs_dp_avg,
         )
         if self.calculate_per_token_loss:
             # Scale the aux_loss by the number of tokens.
@@ -544,7 +573,7 @@ def apply_z_loss(self, logits, padding_mask: Optional[torch.Tensor] = None):
             else:
                 layer_number = self.layer_number
 
-            save_to_aux_losses_tracker(
+            get_moe_metrics_tracker().record(
                 "z_loss", z_loss / moe_z_loss_coeff, layer_number, num_layers
             )
         return logits
@@ -584,7 +613,53 @@ def _apply_expert_bias(
                     routing_map = routing_map & (~padding_mask)
                 self.local_tokens_per_expert += routing_map.sum(dim=0)
 
-    def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
+    def _hash_routing(self, logits: torch.Tensor, input_ids: torch.Tensor):
+        """Hash-based routing: expert indices come from the tid2eid lookup table.
+
+        Scores are still computed from the gating logits for weight computation,
+        but expert selection is determined by the pre-computed hash table.
+
+        Args:
+            logits (torch.Tensor): Gating logits, shape [num_tokens, num_experts].
+            input_ids (torch.Tensor): Token IDs, shape [seq_length, bsz].
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: routing_probs and routing_map.
+        """
+        num_tokens, num_experts = logits.shape
+
+        if self.score_function == "softmax":
+            scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
+        elif self.score_function == "sigmoid":
+            scores = torch.sigmoid(logits.float()).type_as(logits)
+        elif self.score_function == "sqrtsoftplus":
+            scores = torch.nn.functional.softplus(logits.float()).sqrt().type_as(logits)
+        else:
+            raise ValueError(f"Invalid score_function: {self.score_function}")
+
+        # input_ids is [b, s] from the model, but hidden_states are [s, b, h]
+        # and get flattened to [s*b, h]. Transpose to match.
+        flat_ids = input_ids.T.reshape(-1)
+        top_indices = self.tid2eid[flat_ids].long()  # [num_tokens, topk]
+
+        probs = scores.gather(1, top_indices)
+        if self.score_function != "softmax":
+            probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-20)
+
+        if self.config.moe_router_topk_scaling_factor:
+            probs = probs * self.config.moe_router_topk_scaling_factor
+
+        routing_probs = torch.zeros_like(logits).scatter(1, top_indices, probs)
+        routing_map = torch.zeros_like(logits).int().scatter(1, top_indices, 1).bool()
+
+        return routing_probs, routing_map
+
+    def routing(
+        self,
+        logits: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+    ):
         """Top-k routing function
 
         Args:
@@ -592,6 +667,8 @@ def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = N
             padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
                                                    Shape [seq_length, bsz]. True for valid tokens,
                                                    False for padding tokens. Defaults to None.
+            input_ids (torch.Tensor, optional): The input IDs tensor. Shape [seq_length, bsz].
+                                                Defaults to None.
 
         Returns:
             probs (torch.Tensor): The probabilities of token to experts assignment.
@@ -609,7 +686,13 @@ def routing(self, logits: torch.Tensor, padding_mask: Optional[torch.Tensor] = N
         logits = self.apply_z_loss(logits, padding_mask=padding_mask)
 
         # Calculate probs and routing_map for token dispatching
-        if self.routing_type == "sinkhorn":
+        if self.is_hash_layer:
+            assert input_ids is not None, (
+                "input_ids is required for hash-based routing but was None. "
+                "Ensure --moe-n-hash-layers is set correctly and input_ids are passed."
+            )
+            probs, routing_map = self._hash_routing(logits, input_ids)
+        elif self.routing_type == "sinkhorn":
             probs, routing_map = self.sinkhorn_load_balancing(logits)
         else:
             probs, routing_map = topk_routing_with_score_function(
@@ -678,7 +761,12 @@ def reset_global_aux_loss_tracker(self):
             self.global_tokens_per_expert.zero_()
             self.ga_steps.zero_()
 
-    def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = None):
+    def forward(
+        self,
+        input: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+        input_ids: Optional[torch.Tensor] = None,
+    ):
         """
         Forward pass of the router.
 
@@ -687,6 +775,8 @@ def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = No
             padding_mask (torch.Tensor, optional): Boolean mask indicating non-padding tokens.
                                                    Shape [seq_length, bsz]. True for valid tokens,
                                                    False for padding tokens. Defaults to None.
+            input_ids (torch.Tensor, optional): The input IDs tensor. Shape [seq_length, bsz].
+                                                Defaults to None.
         """
         self._maintain_float32_expert_bias()
 
@@ -704,7 +794,7 @@ def forward(self, input: torch.Tensor, padding_mask: Optional[torch.Tensor] = No
                 logits, self.config.moe_router_force_biased, self.layer_number
             )
 
-        probs, routing_map = self.routing(logits, padding_mask=padding_mask)
+        probs, routing_map = self.routing(logits, padding_mask=padding_mask, input_ids=input_ids)
 
         return probs, routing_map
 
diff --git a/megatron/core/transformer/moe/token_dispatcher.py b/megatron/core/transformer/moe/token_dispatcher.py
index 15fc8b984b3..e27e7a7088a 100644
--- a/megatron/core/transformer/moe/token_dispatcher.py
+++ b/megatron/core/transformer/moe/token_dispatcher.py
@@ -37,6 +37,8 @@
 from megatron.core.transformer.moe.shared_experts import SharedExpertMLP
 from megatron.core.transformer.transformer_config import TransformerConfig
 
+logger = logging.getLogger(__name__)
+
 """ We use the following notation throughout this file:
      H: hidden size
      B: micro batch size
@@ -47,8 +49,6 @@
      num_global_tokens: num_local_tokens*TP*EP
 """
 
-logger = logging.getLogger(__name__)
-
 
 class MoETokenDispatcher:
     """
@@ -78,12 +78,30 @@ def __init__(
         self.tp_size = utils.get_pg_size(self.tp_group)
         self.tp_rank = utils.get_pg_rank(self.tp_group)
         self.ep_size = utils.get_pg_size(self.ep_group)
+        self.ep_rank = utils.get_pg_rank(self.ep_group)
 
         # Attributes that need to be captured in cudagraph. These attributes are returned
         # as cudagraph outputs when the cuda_graph_scope contains moe_preprocess.
         self.cudagraph_attrs = []
         self.valid_cudagraph_attrs = None
 
+    def get_cudagraph_attr(self, attr_name: str):
+        """Resolve a cudagraph attribute path, including nested attributes."""
+        attr = self
+        for name in attr_name.split('.'):
+            attr = getattr(attr, name, None)
+            if attr is None:
+                return None
+        return attr
+
+    def set_cudagraph_attr(self, attr_name: str, value) -> None:
+        """Assign to a cudagraph attribute path, including nested attributes."""
+        hier_attr_name = attr_name.split('.')
+        attr = self
+        for name in hier_attr_name[:-1]:
+            attr = getattr(attr, name)
+        setattr(attr, hier_attr_name[-1], value)
+
     @abstractmethod
     def dispatch_preprocess(
         self, tokens: torch.Tensor, routing_map: torch.Tensor, probs: torch.Tensor
@@ -1029,10 +1047,23 @@ def __init__(
                 "https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep."
             )
 
+        self.moe_expert_rank_capacity_factor = self.config.moe_expert_rank_capacity_factor
+        self.over_budget = torch.zeros(1, dtype=torch.bool, device='cuda')
+
     def setup_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor):
         num_tokens = routing_map.shape[0]
         self.routing_map = routing_map.reshape(num_tokens, self.num_experts)
         self.token_probs = probs.reshape(num_tokens, self.num_experts)
+
+        if self.moe_expert_rank_capacity_factor is not None:
+            pad_multiple = get_align_size_for_quantization(self.config)
+            budget = int(
+                routing_map.shape[0]
+                * self.config.moe_router_topk
+                * self.moe_expert_rank_capacity_factor
+            )
+            budget += -budget % pad_multiple
+            self.num_permuted_tokens = budget
         # Compute the capacity for each expert at the drop_and_pad mode
         if self.drop_and_pad:
             num_out_tokens = num_tokens * self.config.moe_router_topk
@@ -1073,16 +1104,24 @@ def dispatch(
                 num_local_experts=self.num_local_experts,
                 num_sms_dispatch_api=self.config.moe_hybridep_num_sms,
                 num_sms_combine_api=self.config.moe_hybridep_num_sms,
+                num_blocks_permute=self.config.moe_hybridep_num_blocks_permute,
+                num_blocks_unpermute=self.config.moe_hybridep_num_blocks_unpermute,
                 num_permuted_tokens=self.num_permuted_tokens,
                 pad_multiple=self.pad_multiple,
+                fused=self.config.moe_permute_fusion_into_hybridep,
+                num_sms_preprocessing_api=self.config.moe_hybridep_num_sms_preprocessing,
             )
         )
+        if self.moe_expert_rank_capacity_factor is not None:
+            over_budget = self.handle[-1] != 0  # this is overflow_flag
+            self.over_budget |= over_budget
 
-        if not self.drop_and_pad:
-            self.tokens_per_expert = tokens_per_expert
+        if self.num_permuted_tokens is None:
+            self.tokens_per_expert = tokens_per_expert.to(torch.int64)
             # self.num_permuted_tokens is necessary to allocate the output tensor for permute
             self.num_permuted_tokens = self.tokens_per_expert.sum()
-
+        if self.moe_expert_rank_capacity_factor is not None:
+            self.tokens_per_expert = tokens_per_expert.to(torch.int64)
         return dispatched_hidden
 
     def combine(
@@ -1096,6 +1135,7 @@ def combine(
             handle=self.handle,
             num_permuted_tokens=self.num_permuted_tokens,
             pad_multiple=self.pad_multiple,
+            fused=self.config.moe_permute_fusion_into_hybridep,
         )
         # Release the used handle/num_permuted_tokens which could change in each iteration.
         # For drop_and_pad mode, we don't need to reset the num_permuted_tokens and
@@ -1385,8 +1425,8 @@ def __init__(
 
         self.num_local_experts = num_local_experts
         self.local_expert_indices = local_expert_indices
-        assert self.tp_size * self.ep_size > 1, "Flex token dispatcher requires TPxEP > 1"
         if self.config.moe_flex_dispatcher_backend == "deepep":
+            assert self.tp_size * self.ep_size > 1, "DeepEP dispatcher requires TPxEP > 1"
             self._comm_manager = _DeepepManager(
                 group=self.tp_ep_group,
                 num_local_experts=self.num_local_experts,
@@ -1435,6 +1475,7 @@ def _initialize_metadata(self, routing_map: torch.Tensor, probs: torch.Tensor) -
             .expand(-1, -1, self.tp_size, -1)
             .reshape(num_local_tokens, world_size, self.num_local_experts)
         ).contiguous()
+
         return routing_map, probs
 
     @jit_fuser
@@ -1569,3 +1610,15 @@ def combine_postprocess(self, hidden_states: torch.Tensor):
             self.shared_experts.post_forward_comm()
             hidden_states += self.shared_experts.get_output()
         return hidden_states.view(self.hidden_shape)
+
+    def check_over_budget(self):
+        """Check if the dispatcher has exceeded its budget."""
+        if hasattr(self._comm_manager, 'over_budget'):
+            return self._comm_manager.over_budget
+        else:
+            return None
+
+    def reset_over_budget(self):
+        """Reset the accumulated over-budget flag on the communication manager."""
+        if hasattr(self._comm_manager, 'over_budget'):
+            self._comm_manager.over_budget.fill_(0)
diff --git a/megatron/core/transformer/multi_latent_attention.py b/megatron/core/transformer/multi_latent_attention.py
index 601ae89fae1..c8f00084c5d 100644
--- a/megatron/core/transformer/multi_latent_attention.py
+++ b/megatron/core/transformer/multi_latent_attention.py
@@ -139,6 +139,7 @@ def __init__(
         cp_comm_type: Optional[str] = None,
         pg_collection: Optional[ProcessGroupCollection] = None,
         pp_layer_offset: Optional[int] = None,
+        is_mtp_layer: bool = False,
     ) -> None:
 
         super().__init__(
@@ -149,6 +150,7 @@ def __init__(
             attn_mask_type=attn_mask_type,
             pg_collection=pg_collection,
             pp_layer_offset=pp_layer_offset,
+            is_mtp_layer=is_mtp_layer,
         )
         self.config: MLATransformerConfig
 
@@ -332,7 +334,8 @@ def forward(
         # Get the query, key and value tensors based on the type of attention -
         # self or cross attn.
         # query: [96, 1, 16, 128], key:[96, 1, 16, 128], value:[96, 1, 16, 128]
-        with off_interface(self.offload_qkv_linear, hidden_states, "qkv_linear") as hidden_states:
+        qkv_linear_manager = off_interface(self.offload_qkv_linear, hidden_states, "qkv_linear")
+        with qkv_linear_manager as hidden_states:
             query, key, value, q_compressed, kv_compressed = self.get_query_key_value_tensors(
                 hidden_states,
                 key_value_states,
@@ -340,10 +343,7 @@ def forward(
                 packed_seq_params,
                 inference_context=inference_context,
             )
-        if self.offload_qkv_linear:
-            query = off_interface.group_commit(
-                query, name="qkv_linear", forced_released_tensors=[hidden_states]
-            )
+        query = qkv_linear_manager.group_offload(query, forced_released_tensors=[hidden_states])
 
         # ===================================================
         # Adjust key, value for inference
@@ -450,12 +450,10 @@ def forward(
         # =================
         # Output. [sq, b, h]
         # =================
-        with off_interface(self.offload_attn_proj, core_attn_out, "attn_proj") as core_attn_out:
+        attn_proj_manager = off_interface(self.offload_attn_proj, core_attn_out, "attn_proj")
+        with attn_proj_manager as core_attn_out:
             output, bias = self.linear_proj(core_attn_out)
-        if self.offload_attn_proj:
-            output = off_interface.group_commit(
-                output, name="attn_proj", forced_released_tensors=[core_attn_out]
-            )
+        output = attn_proj_manager.group_offload(output, forced_released_tensors=[core_attn_out])
 
         return output, bias
 
@@ -476,6 +474,7 @@ def __init__(
         cp_comm_type: Optional[str] = None,
         pg_collection: Optional[ProcessGroupCollection] = None,
         pp_layer_offset: Optional[int] = None,
+        is_mtp_layer: bool = False,
     ):
         if pg_collection is None:
             pg_collection = ProcessGroupCollection.use_mpu_process_groups()
@@ -489,6 +488,7 @@ def __init__(
             cp_comm_type=cp_comm_type,
             pg_collection=pg_collection,
             pp_layer_offset=pp_layer_offset,
+            is_mtp_layer=is_mtp_layer,
         )
 
         if self.config.q_lora_rank is None:
@@ -659,8 +659,8 @@ def get_query_key_value_tensors(
         if packed_seq_params is not None:
             assert (
                 packed_seq_params.local_cp_size is None
-            ), "hybrid_context_parallel is not supported with MLA yet and is planned for future. \
-            Please disable hybrid_context_parallel."
+            ), "dynamic_context_parallel is not supported with MLA yet and is planned for future. \
+            Please disable dynamic_context_parallel."
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
@@ -915,6 +915,7 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
                     cu_seqlens=cu_seqlens_q,
                     mscale=mscale,
                     cp_group=self.pg_collection.cp,
+                    mla_rotary_interleaved=True,
                 )
                 # k_pos_emb:[num_tokens, 1, qk_pos_emb_head_dim]
                 k_pos_emb = apply_rotary_pos_emb(
@@ -924,6 +925,7 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
                     cu_seqlens=cu_seqlens_kv,
                     mscale=mscale,
                     cp_group=self.pg_collection.cp,
+                    mla_rotary_interleaved=True,
                 )
 
                 # query: [num_tokens, n, (qk_head_dim + v_head_dim)]
@@ -1212,6 +1214,7 @@ def __init__(
         attn_mask_type=AttnMaskType.padding,
         cp_comm_type: Optional[str] = None,
         pg_collection: Optional[ProcessGroupCollection] = None,
+        is_mtp_layer: bool = False,
     ):
         if pg_collection is None:
             pg_collection = ProcessGroupCollection.use_mpu_process_groups()
@@ -1225,6 +1228,7 @@ def __init__(
             attention_type="self",
             cp_comm_type=cp_comm_type,
             pg_collection=pg_collection,
+            is_mtp_layer=is_mtp_layer,
         )
 
         assert self.config.q_lora_rank is not None, (
diff --git a/megatron/core/transformer/multi_token_prediction.py b/megatron/core/transformer/multi_token_prediction.py
index b70fdc3f28b..e45064e858c 100755
--- a/megatron/core/transformer/multi_token_prediction.py
+++ b/megatron/core/transformer/multi_token_prediction.py
@@ -7,6 +7,7 @@
 from typing import TYPE_CHECKING, Callable, List, Optional, Union
 
 import torch
+import torch.nn as nn
 from torch import Tensor
 
 from megatron.core import InferenceParams, parallel_state, tensor_parallel
@@ -23,6 +24,7 @@
     scatter_to_sequence_parallel_region,
 )
 from megatron.core.transformer.enums import AttnMaskType, LayerType
+from megatron.core.transformer.hyper_connection import learned_output_contract
 from megatron.core.transformer.module import MegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.transformer.torch_norm import LayerNormBuilder
@@ -427,11 +429,15 @@ class MultiTokenPredictionLayerSubmodules:
     layer_norm: LayerNormBuilder
 
     eh_proj: Union[ModuleSpec, type] = None
+    e_proj: Union[ModuleSpec, type] = None
+    h_proj: Union[ModuleSpec, type] = None
     mtp_model_layer: Union[ModuleSpec, type] = None
 
 
 def get_mtp_layer_spec(
-    mtp_model_layer_spec: ModuleSpec, use_transformer_engine: bool
+    mtp_model_layer_spec: ModuleSpec,
+    use_transformer_engine: bool,
+    enable_hyper_connections: bool = False,
 ) -> ModuleSpec:
     """Get the MTP layer spec.
 
@@ -441,11 +447,14 @@ def get_mtp_layer_spec(
     return get_mtp_layer_spec_for_backend(
         mtp_model_layer_spec,
         backend=TESpecProvider() if use_transformer_engine else LocalSpecProvider(),
+        enable_hyper_connections=enable_hyper_connections,
     )
 
 
 def get_mtp_layer_spec_for_backend(
-    mtp_model_layer_spec: ModuleSpec, backend: BackendSpecProvider
+    mtp_model_layer_spec: ModuleSpec,
+    backend: BackendSpecProvider,
+    enable_hyper_connections: bool = False,
 ) -> ModuleSpec:
     """Get the MTP layer spec.
 
@@ -454,15 +463,22 @@ def get_mtp_layer_spec_for_backend(
     """
     column_parallel_linear_impl: type = backend.column_parallel_linear()
     layer_norm_impl = backend.layer_norm()
+
+    submodules_kwargs = dict(
+        enorm=layer_norm_impl,
+        hnorm=layer_norm_impl,
+        mtp_model_layer=mtp_model_layer_spec,
+        layer_norm=layer_norm_impl,
+    )
+    if enable_hyper_connections:
+        submodules_kwargs["e_proj"] = column_parallel_linear_impl
+        submodules_kwargs["h_proj"] = column_parallel_linear_impl
+    else:
+        submodules_kwargs["eh_proj"] = column_parallel_linear_impl
+
     mtp_layer_spec = ModuleSpec(
         module=MultiTokenPredictionLayer,
-        submodules=MultiTokenPredictionLayerSubmodules(
-            enorm=layer_norm_impl,
-            hnorm=layer_norm_impl,
-            eh_proj=column_parallel_linear_impl,
-            mtp_model_layer=mtp_model_layer_spec,
-            layer_norm=layer_norm_impl,
-        ),
+        submodules=MultiTokenPredictionLayerSubmodules(**submodules_kwargs),
     )
     return mtp_layer_spec
 
@@ -661,26 +677,39 @@ def process_mtp_loss(
     # correctly scaled relative to the main loss gradients in finalize_model_grads.
     original_num_tokens = loss_mask.sum()
 
+    fuse_linear_cross_entropy = (
+        config.cross_entropy_loss_fusion and config.cross_entropy_fusion_impl == "linear"
+    )
     for mtp_layer_number in range(config.mtp_num_layers):
-        mtp_logits, _ = output_layer(
-            hidden_states_list[mtp_layer_number + 1],
-            weight=output_weight,
-            runtime_gather_output=runtime_gather_output,
-        )
-        if scale_logits_fn is not None:
-            mtp_logits = scale_logits_fn(mtp_logits)
         mtp_labels, _ = roll_tensor(
             mtp_labels, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params
         )
         loss_mask, num_tokens = roll_tensor(
             loss_mask, shifts=-1, dims=-1, cp_group=cp_group, packed_seq_params=packed_seq_params
         )
-        mtp_loss = compute_language_model_loss(mtp_labels, mtp_logits)
+        if fuse_linear_cross_entropy:
+            mtp_loss = output_layer(
+                hidden_states_list[mtp_layer_number + 1],
+                weight=output_weight,
+                runtime_gather_output=runtime_gather_output,
+                output_cross_entropy_loss=True,
+                labels=mtp_labels,
+            )
+        else:
+            mtp_logits, _ = output_layer(
+                hidden_states_list[mtp_layer_number + 1],
+                weight=output_weight,
+                runtime_gather_output=runtime_gather_output,
+            )
+            if scale_logits_fn is not None:
+                mtp_logits = scale_logits_fn(mtp_logits)
+            mtp_loss = compute_language_model_loss(mtp_labels, mtp_logits)
         mtp_loss = loss_mask * mtp_loss
         if is_training:
+            # Safe divide without sync: mask numerator when num_tokens==0, divide by clamp(min=1)
             mtp_loss_for_log = (
-                torch.sum(mtp_loss) / num_tokens if num_tokens > 0 else mtp_loss.new_tensor(0.0)
-            )
+                torch.sum(mtp_loss) * (num_tokens > 0).to(mtp_loss.dtype)
+            ) / num_tokens.clamp(min=1)
             MTPLossLoggingHelper.save_loss_to_tracker(
                 mtp_loss_for_log,
                 mtp_layer_number,
@@ -790,6 +819,8 @@ def __init__(
                     f"The supported attention mask types are {SUPPORTED_ATTN_MASK}."
                 )
 
+        self.mhc_enabled = self.config.enable_hyper_connections
+
         self.enorm = self.submodules.enorm(
             config=self.config,
             hidden_size=self.config.hidden_size,
@@ -802,24 +833,58 @@ def __init__(
             eps=self.config.layernorm_epsilon,
         )
 
-        # For the linear projection at the (k - 1)-th MTP layer, the input is the concatenation
-        # of the i-th token's hidden states and the (i + K)-th token's decoder input,
-        # so the input's shape is [s, b, 2*h].
-        # The output will be send to the following transformer layer,
-        # so the output's shape should be [s, b, h].
-        self.eh_proj = build_module(
-            self.submodules.eh_proj,
-            self.config.hidden_size * 2,
-            self.config.hidden_size,
-            config=self.config,
-            init_method=self.config.init_method,
-            gather_output=False,
-            bias=False,
-            skip_bias_add=False,
-            is_expert=False,
-            tp_comm_buffer_name="mtp_eh_proj",
-            tp_group=pg_collection.tp if pg_collection is not None else None,
-        )
+        if self.mhc_enabled:
+            # mHC mode: separate e_proj and h_proj, operating per-stream.
+            # e_proj: [h] -> [h], applied to embedding then broadcast across streams.
+            # h_proj: [h] -> [h], applied per-stream on hidden states.
+            self.e_proj = build_module(
+                self.submodules.e_proj,
+                self.config.hidden_size,
+                self.config.hidden_size,
+                config=self.config,
+                init_method=self.config.init_method,
+                gather_output=False,
+                bias=False,
+                skip_bias_add=False,
+                is_expert=False,
+                tp_comm_buffer_name="mtp_e_proj",
+                tp_group=pg_collection.tp if pg_collection is not None else None,
+            )
+            self.h_proj = build_module(
+                self.submodules.h_proj,
+                self.config.hidden_size,
+                self.config.hidden_size,
+                config=self.config,
+                init_method=self.config.init_method,
+                gather_output=False,
+                bias=False,
+                skip_bias_add=False,
+                is_expert=False,
+                tp_comm_buffer_name="mtp_h_proj",
+                tp_group=pg_collection.tp if pg_collection is not None else None,
+            )
+            self.eh_proj = None
+        else:
+            # For the linear projection at the (k - 1)-th MTP layer, the input is the concatenation
+            # of the i-th token's hidden states and the (i + K)-th token's decoder input,
+            # so the input's shape is [s, b, 2*h].
+            # The output will be send to the following transformer layer,
+            # so the output's shape should be [s, b, h].
+            self.eh_proj = build_module(
+                self.submodules.eh_proj,
+                self.config.hidden_size * 2,
+                self.config.hidden_size,
+                config=self.config,
+                init_method=self.config.init_method,
+                gather_output=False,
+                bias=False,
+                skip_bias_add=False,
+                is_expert=False,
+                tp_comm_buffer_name="mtp_eh_proj",
+                tp_group=pg_collection.tp if pg_collection is not None else None,
+            )
+            self.e_proj = None
+            self.h_proj = None
 
         # Build inner layers: two possible paths
         # 1. Hybrid path: use HybridStack for hybrid pattern support
@@ -858,6 +923,19 @@ def __init__(
             hidden_size=self.config.hidden_size,
             eps=self.config.layernorm_epsilon,
         )
+
+        if self.mhc_enabled:
+            hc_mult = self.config.num_residual_streams
+            hc_dim = self.config.hidden_size * hc_mult
+            self.hc_head_fn = nn.Parameter(torch.randn(hc_mult, hc_dim))
+            self.hc_head_base = nn.Parameter(torch.zeros(hc_mult))
+            self.hc_head_scale = nn.Parameter(torch.ones(1))
+            nn.init.xavier_uniform_(self.hc_head_fn)
+            if self.config.sequence_parallel:
+                setattr(self.hc_head_fn, 'sequence_parallel', True)
+                setattr(self.hc_head_base, 'sequence_parallel', True)
+                setattr(self.hc_head_scale, 'sequence_parallel', True)
+
         self.offload_context = nullcontext()
 
     def _get_embeddings(
@@ -911,21 +989,51 @@ def _concat_embeddings(self, hidden_states: torch.Tensor, decoder_input: torch.T
         """
         decoder_input = apply_module(self.enorm)(decoder_input)
         decoder_input = make_viewless_tensor(inp=decoder_input, requires_grad=True, keep_graph=True)
-        hidden_states = apply_module(self.hnorm)(hidden_states)
-        hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
-        # At the (k - 1)-th MTP module, concatenates the i-th token's hidden_states
-        # and the (i + K)-th token's embedding, and combine them with linear projection.
-        hidden_states = torch.cat((decoder_input, hidden_states), -1)
-        hidden_states, _ = self.eh_proj(hidden_states)
-        # For tensor parallel we need to gather the tensor across the model-parallel
-        # ranks after the linear projection. This used to call
-        # `all_gather_last_dim_from_tensor_parallel_region`, but that utility reduces
-        # the gradient in backward pass and was therefore incorrect in this context.
-        # It has been replaced with the correct `gather_from_tensor_model_parallel_region`.
-        hidden_states = gather_from_tensor_model_parallel_region(hidden_states, group=self.tp_group)
-        # For sequence parallel, scatter after linear_fc and before transformer layer.
-        if self.sequence_parallel:
-            hidden_states = scatter_to_sequence_parallel_region(hidden_states, group=self.tp_group)
+
+        if self.mhc_enabled:
+            n = self.config.num_residual_streams
+            h = self.config.hidden_size
+            # hidden_states is [s, b, n*h] (multi-stream).
+            # hnorm operates per-stream on the h dimension.
+            s, b, _ = hidden_states.shape
+            hs_streams = hidden_states.view(s, b, n, h)
+            hs_streams = apply_module(self.hnorm)(hs_streams)
+            hs_streams = make_viewless_tensor(inp=hs_streams, requires_grad=True, keep_graph=True)
+            # e_proj: [s, b, h] -> [s, b, h], then broadcast to [s, b, n, h]
+            e_out, _ = self.e_proj(decoder_input)
+            e_out = gather_from_tensor_model_parallel_region(e_out, group=self.tp_group)
+            e_out = e_out.unsqueeze(2).expand(s, b, n, h)
+            # h_proj: applied per-stream on the h dimension
+            h_out, _ = self.h_proj(hs_streams)
+            h_out = gather_from_tensor_model_parallel_region(h_out, group=self.tp_group)
+            # Combine and flatten back to [s, b, n*h]
+            hidden_states = (e_out + h_out).reshape(s, b, n * h)
+            if self.sequence_parallel:
+                hidden_states = scatter_to_sequence_parallel_region(
+                    hidden_states, group=self.tp_group
+                )
+        else:
+            hidden_states = apply_module(self.hnorm)(hidden_states)
+            hidden_states = make_viewless_tensor(
+                inp=hidden_states, requires_grad=True, keep_graph=True
+            )
+            # At the (k - 1)-th MTP module, concatenates the i-th token's hidden_states
+            # and the (i + K)-th token's embedding, and combine them with linear projection.
+            hidden_states = torch.cat((decoder_input, hidden_states), -1)
+            hidden_states, _ = self.eh_proj(hidden_states)
+            # For tensor parallel we need to gather the tensor across the model-parallel
+            # ranks after the linear projection. This used to call
+            # `all_gather_last_dim_from_tensor_parallel_region`, but that utility reduces
+            # the gradient in backward pass and was therefore incorrect in this context.
+            # It has been replaced with the correct `gather_from_tensor_model_parallel_region`.
+            hidden_states = gather_from_tensor_model_parallel_region(
+                hidden_states, group=self.tp_group
+            )
+            # For sequence parallel, scatter after linear_fc and before transformer layer.
+            if self.sequence_parallel:
+                hidden_states = scatter_to_sequence_parallel_region(
+                    hidden_states, group=self.tp_group
+                )
         return hidden_states
 
     def _proj_and_transformer_layer(
@@ -993,7 +1101,8 @@ def _proj_and_transformer_layer(
                         sequence_len_offset=sequence_len_offset,
                     )
 
-        hidden_states = self._postprocess(hidden_states)
+        if not self.mhc_enabled:
+            hidden_states = self._postprocess(hidden_states)
 
         return hidden_states
 
@@ -1002,6 +1111,16 @@ def _postprocess(self, hidden_states: torch.Tensor):
         Postprocesses the output of the transformer layers.
         """
 
+        if self.mhc_enabled:
+            hidden_states = learned_output_contract(
+                hidden_states,
+                self.hc_head_fn,
+                self.hc_head_base,
+                self.hc_head_scale,
+                self.config.num_residual_streams,
+                self.config.layernorm_epsilon,
+            )
+
         # Layer norm before shared head layer.
         hidden_states = apply_module(self.final_layernorm)(hidden_states)
         # TENorm produces a "viewed" tensor. This will result in schedule.py's
@@ -1136,6 +1255,9 @@ def forward(
             [s, b, h], and optionally the updated context tensor if cross-attention is used.
         """
         assert context is None, "multi token prediction + cross attention is not yet supported."
+        _orig_cp_group = self.cp_group
+        if packed_seq_params is not None and packed_seq_params.cp_group is not None:
+            self.cp_group = packed_seq_params.cp_group
         input_ids, position_ids, decoder_input, hidden_states = self._get_embeddings(
             input_ids=input_ids,
             position_ids=position_ids,
@@ -1176,6 +1298,7 @@ def forward(
                 sequence_len_offset=sequence_len_offset,
             )
 
+        self.cp_group = _orig_cp_group
         return hidden_states, input_ids, position_ids
 
     def sharded_state_dict(
@@ -1434,6 +1557,7 @@ def forward(
         sequence_len_offset: Optional[Tensor] = None,
         extra_block_kwargs: Optional[dict] = None,
         embedding=None,
+        mhc_multistream: Optional[Tensor] = None,
     ) -> Tensor:
         """
         Perform the forward pass through all of the MTP modules.
@@ -1441,6 +1565,9 @@ def forward(
         Args:
             hidden_states (Tensor): Hidden states for input token with the shape [s, b, h]
                 where s is the sequence length, b is the batch size, and h is the hidden size.
+                Contracted decoder hidden states [s, b, h] when mHC is enabled.
+            mhc_multistream (Tensor, optional): When mHC is enabled, the pre-contraction
+                multi-stream decoder output [s, b, n*h] used as input to MTP depths.
             attention_mask (Tensor): Boolean tensor of shape [1, 1, s, s] for masking
                 self-attention.
 
@@ -1450,7 +1577,12 @@ def forward(
         # get hidden states from previous mtp stages
         offset = get_mtp_layer_offset(self.config, self.vp_stage)
         hidden_states_list = list(torch.chunk(hidden_states, 1 + offset, dim=0))
-        hidden_states = hidden_states_list[offset]
+        if mhc_multistream is not None:
+            # mHC mode: use multi-stream for MTP depth input, contracted for loss list.
+            mhc_chunks = list(torch.chunk(mhc_multistream, 1 + offset, dim=0))
+            hidden_states = mhc_chunks[offset]
+        else:
+            hidden_states = hidden_states_list[offset]
         for iteration in range(self.config.mtp_num_layers):
             layer_idx = 0 if self.mtp_use_repeated_layer else iteration
             (hidden_states, input_ids, position_ids) = self.layers[layer_idx](
@@ -1468,9 +1600,13 @@ def forward(
                 **(extra_block_kwargs or {}),
             )
 
-            # append the output hidden states of the current mtp layer
-            # to the hidden_states_list
-            hidden_states_list.append(hidden_states)
+            if mhc_multistream is not None:
+                mhc_chunks.append(hidden_states)
+                hidden_states_list.append(self.layers[layer_idx]._postprocess(hidden_states))
+            else:
+                # append the output hidden states of the current mtp layer
+                # to the hidden_states_list
+                hidden_states_list.append(hidden_states)
 
         # concat the hidden states of all mtp layers
         hidden_states = torch.cat(hidden_states_list, dim=0)
diff --git a/megatron/core/transformer/transformer_block.py b/megatron/core/transformer/transformer_block.py
index 8bea3b8c94e..5189ce42823 100755
--- a/megatron/core/transformer/transformer_block.py
+++ b/megatron/core/transformer/transformer_block.py
@@ -1,10 +1,12 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import logging
 from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import List, Optional, Set, Union, cast
+from typing import List, Optional, Set, Tuple, Union, cast
 
 import torch
+import torch.nn as nn
 from torch import Tensor
 
 from megatron.core import parallel_state, tensor_parallel
@@ -19,7 +21,12 @@
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.pipeline_parallel.utils import is_vp_first_stage, is_vp_last_stage
 from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.random import CheckpointManager
 from megatron.core.transformer.enums import CudaGraphScope, LayerType
+from megatron.core.transformer.hyper_connection import (
+    HyperConnectionModule,
+    learned_output_contract,
+)
 from megatron.core.transformer.module import GraphableMegatronModule, MegatronModule
 from megatron.core.transformer.spec_utils import ModuleSpec, build_module
 from megatron.core.transformer.torch_norm import LayerNormBuilder
@@ -28,7 +35,7 @@
     BaseTransformerLayer,
     get_transformer_layer_offset,
 )
-from megatron.core.transformer.utils import sharded_state_dict_default
+from megatron.core.transformer.utils import sharded_state_dict_default, make_sharded_tensors_for_checkpoint
 from megatron.core.typed_torch import apply_module, not_none
 from megatron.core.utils import (
     WrappedTensor,
@@ -319,6 +326,7 @@ def __init__(
             self.offload_context, self.group_prefetch_offload_commit_async = nullcontext(), None
             self.config._cpu_offloading_context = None
 
+        self.num_residual_streams = config.num_residual_streams
         self._build_layers()
         self.num_layers_per_pipeline_rank = len(self.layers)
 
@@ -377,6 +385,17 @@ def build_layer(layer_spec, layer_number):
                 hidden_size=self.config.hidden_size,
                 eps=self.config.layernorm_epsilon,
             )
+            if self.config.enable_hyper_connections:
+                hc_mult = self.config.num_residual_streams
+                hc_dim = self.config.hidden_size * hc_mult
+                self.hc_head_fn = nn.Parameter(torch.randn(hc_mult, hc_dim))
+                self.hc_head_base = nn.Parameter(torch.zeros(hc_mult))
+                self.hc_head_scale = nn.Parameter(torch.ones(1))
+                nn.init.xavier_uniform_(self.hc_head_fn)
+                if self.config.sequence_parallel:
+                    setattr(self.hc_head_fn, 'sequence_parallel', True)
+                    setattr(self.hc_head_base, 'sequence_parallel', True)
+                    setattr(self.hc_head_scale, 'sequence_parallel', True)
         else:
             self.final_layernorm = None  # Either this or nn.Identity
 
@@ -386,7 +405,6 @@ def build_layer(layer_spec, layer_number):
     def has_final_layernorm_in_this_stage(self):
         """
         Check if this vpp stage contains the final layernorm.
-
         Note:
             Final layernorm now has been moved from the post-process stage to the last decoder
             layer by using this function.
@@ -452,6 +470,7 @@ def _checkpointed_forward(
         padding_mask: Optional[Tensor] = None,
         extract_layer_indices: Optional[Set[int]] = None,
         layer_offset: int = 0,
+        input_ids: Optional[Tensor] = None,
     ):
         """Forward method with activation checkpointing.
 
@@ -510,6 +529,7 @@ def custom_forward(
                             inference_context=None,
                             packed_seq_params=packed_seq_params,
                             padding_mask=padding_mask,
+                            input_ids=input_ids,
                         )
                 return hidden_states, context
 
@@ -642,6 +662,46 @@ def __call__(self, *args, **kwargs):
             return super().__call__(*args, **kwargs)[0]
         return super().__call__(*args, **kwargs)
 
+    def _build_mhc_recompute_layer_plan(
+        self, use_mhc_recompute: bool
+    ) -> Tuple[List[Optional[CheckpointManager]], List[bool]]:
+        """Pre-build per-layer MHC recompute managers and block-end markers."""
+        num_layers = len(self.layers)
+        layer_managers: List[Optional[CheckpointManager]] = [None] * num_layers
+        is_recompute_block_end: List[bool] = [False] * num_layers
+
+        if not use_mhc_recompute or num_layers == 0:
+            return layer_managers, is_recompute_block_end
+
+        mhc_recompute_layer_num = self.config.mhc_recompute_layer_num
+        mhc_manager = CheckpointManager()
+
+        for l_no in range(num_layers):
+            is_last_in_transformer_block = l_no == num_layers - 1
+            is_last_in_recompute_block = is_last_in_transformer_block
+            if mhc_recompute_layer_num is not None:
+                is_last_in_recompute_block = is_last_in_transformer_block or (
+                    (l_no + 1) % mhc_recompute_layer_num == 0
+                )
+
+            layer_managers[l_no] = mhc_manager
+            is_recompute_block_end[l_no] = is_last_in_recompute_block
+
+            if is_last_in_recompute_block and not is_last_in_transformer_block:
+                mhc_manager = CheckpointManager()
+
+        return layer_managers, is_recompute_block_end
+
+    @staticmethod
+    def _finalize_mhc_recompute_layer(
+        mhc_manager: Optional[CheckpointManager],
+        hidden_states: Tensor,
+        is_last_in_recompute_block: bool,
+    ) -> None:
+        """Finalize MHC recompute state for the current layer when block ends."""
+        if mhc_manager is not None and is_last_in_recompute_block:
+            mhc_manager.discard_all_outputs_and_register_unified_recompute(hidden_states)
+
     def forward(
         self,
         hidden_states: Union[Tensor, WrappedTensor],
@@ -658,6 +718,7 @@ def forward(
         sequence_len_offset: Optional[Tensor] = None,
         padding_mask: Optional[Tensor] = None,
         extract_layer_indices: Optional[Set[int]] = None,
+        input_ids: Optional[Tensor] = None,
         *,
         inference_params: Optional[BaseInferenceContext] = None,
         dynamic_inference_decode_only: Optional[bool] = None,
@@ -751,6 +812,13 @@ def forward(
         #   is called here to be future-proof and corner-case-proof.
         hidden_states = make_viewless_tensor(inp=hidden_states, requires_grad=True, keep_graph=True)
 
+        # Expand hidden states for hyper connections at the start of the block
+        # Only expand at the first PP stage; subsequent stages receive n-stream from previous stage
+        if self.config.enable_hyper_connections and self.pre_process:
+            hidden_states = HyperConnectionModule.input_expand(
+                hidden_states, self.num_residual_streams
+            )  # [s, b, C] -> [s, b, n*C]
+
         if self.config.sequence_parallel:
             rng_context = tensor_parallel.get_cuda_rng_tracker().fork()
         else:
@@ -778,6 +846,18 @@ def forward(
             use_inner_quantization_context = False
             outer_quantization_context = nullcontext()
 
+        # Determine if MHC recompute should be used
+        # Only enable when: training mode AND hyper connections AND 'mhc' in recompute_modules
+        use_mhc_recompute = (
+            self.training
+            and self.config.enable_hyper_connections
+            and self.config.recompute_granularity == 'selective'
+            and "mhc" in self.config.recompute_modules
+        )
+        mhc_layer_managers, mhc_is_last_in_recompute_block = self._build_mhc_recompute_layer_plan(
+            use_mhc_recompute
+        )
+
         with rng_context, outer_quantization_context:
             # Forward pass.
             if self.config.recompute_granularity == 'full' and self.training:
@@ -793,6 +873,7 @@ def forward(
                     padding_mask=padding_mask,
                     extract_layer_indices=extract_layer_indices,
                     layer_offset=layer_offset,
+                    input_ids=input_ids,
                 )
                 # Handle return value from _checkpointed_forward
                 if len(extract_layer_indices) > 0:
@@ -818,6 +899,12 @@ def forward(
                     else:
                         inner_quantization_context = nullcontext()
 
+                    mhc_manager = mhc_layer_managers[l_no]
+                    if mhc_manager is not None:
+                        mhc_manager.is_last_layer_in_recompute_block = (
+                            mhc_is_last_in_recompute_block[l_no]
+                        )
+
                     with self.offload_context, inner_quantization_context:
                         hidden_states, context = layer(
                             hidden_states=hidden_states,
@@ -833,7 +920,14 @@ def forward(
                             packed_seq_params=packed_seq_params,
                             sequence_len_offset=sequence_len_offset,
                             padding_mask=padding_mask,
+                            mhc_recompute_manager=mhc_manager,
+                            input_ids=input_ids,
                         )
+                    self._finalize_mhc_recompute_layer(
+                        mhc_manager=mhc_manager,
+                        hidden_states=hidden_states,
+                        is_last_in_recompute_block=mhc_is_last_in_recompute_block[l_no],
+                    )
 
                     if (
                         torch.is_grad_enabled()
@@ -846,6 +940,25 @@ def forward(
                     if (l_no + layer_offset) in extract_layer_indices:
                         intermediate_hidden_states.append(hidden_states)
 
+        # Only contract if the final layer norm is in this stage
+        mhc_multistream = None
+        if self.config.enable_hyper_connections and self.has_final_layernorm_in_this_stage():
+            # When MTP is enabled, save pre-contraction multi-stream for MTP input.
+            if self.config.mtp_num_layers is not None:
+                assert (
+                    len(extract_layer_indices) == 0
+                ), "Feature extraction is not supported with mHC + MTP."
+                mhc_multistream = hidden_states
+            # [s, b, n*C] -> [s, b, C]
+            hidden_states = learned_output_contract(
+                hidden_states,
+                self.hc_head_fn,
+                self.hc_head_base,
+                self.hc_head_scale,
+                self.config.num_residual_streams,
+                self.config.layernorm_epsilon,
+            )
+
         # Final layer norm.
         if self.final_layernorm is not None:
             hidden_states = apply_module(self.final_layernorm)(cast(Tensor, hidden_states))
@@ -864,6 +977,11 @@ def forward(
         if len(extract_layer_indices) > 0:
             return hidden_states, intermediate_hidden_states
 
+        # When mHC + MTP, return both contracted [s,b,h] (for lm_head) and
+        # pre-contraction multi-stream [s,b,n*h] (for MTP input).
+        if mhc_multistream is not None:
+            return hidden_states, mhc_multistream
+
         return hidden_states
 
     def sharded_state_dict(
@@ -955,4 +1073,15 @@ def sharded_state_dict(
                     )
                 )
 
+        # Include direct nn.Parameter attributes (e.g. hc_head_fn/base/scale for
+        # learned output contraction) that are not sub-modules and would be missed
+        # by the named_children() loop above.
+        for pname, param in self._parameters.items():
+            if param is not None:
+                local_sd = {pname: param}
+                wrapped = make_sharded_tensors_for_checkpoint(
+                    local_sd, prefix, sharded_offsets=sharded_offsets,
+                )
+                sharded_state_dict.update(wrapped)
+
         return sharded_state_dict
diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
index 40c1a745493..9792aabb905 100644
--- a/megatron/core/transformer/transformer_config.py
+++ b/megatron/core/transformer/transformer_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import logging
 import math
@@ -11,8 +11,9 @@
 
 from megatron.core.enums import Fp4Recipe, Fp8Recipe
 from megatron.core.quantization.quant_config import RecipeConfig
-from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
+from megatron.core.transformer.enums import AttnBackend, CudaGraphScope, LayerType
 from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout
+from megatron.core.utils import experimental_api
 
 from .._rank_utils import log_single_rank
 from ..fusions.fused_bias_geglu import quick_gelu
@@ -37,6 +38,7 @@
 
 
 @dataclass
+@experimental_api
 class TransformerConfig(ModelParallelConfig):
     """Configuration object for megatron-core transformers.
 
@@ -198,7 +200,7 @@ class TransformerConfig(ModelParallelConfig):
 
     activation_func_clamp_value: Optional[float] = None
     """Clamp the output of the linear_fc1 in the activation function. Only used when activation_func
-    is quick_gelu."""
+    is quick_gelu or weighted SwiGLU (MoE only)."""
 
     num_moe_experts: Optional[int] = None
     """Number of experts to use for MoE layer. When set, it replaces MLP with MoE layer. Set to None
@@ -242,6 +244,11 @@ class TransformerConfig(ModelParallelConfig):
     attention_output_gate: bool = False
     """Whether to apply output gate to the attention layers."""
 
+    rotary_base_per_layer: Optional[List[float]] = None
+    """Per-layer RoPE theta values. Length must equal num_layers. When set, each
+    SelfAttention layer creates its own RotaryEmbedding with the corresponding base;
+    the shared model-level rotary_pos_emb is not created."""
+
     test_mode: bool = False
     """Whether to run real-time tests."""
 
@@ -262,8 +269,10 @@ class TransformerConfig(ModelParallelConfig):
     ####################
     # attention variant
     ####################
-    experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa']] = None
-    """Type of attention variant to use. Currently support gated_delta_net and dsa."""
+    experimental_attention_variant: Optional[Literal['gated_delta_net', 'dsa', 'dsv4_hybrid']] = (
+        None
+    )
+    """Type of attention variant to use. Currently support gated_delta_net, dsa, and dsv4_hybrid."""
 
     ####################
     # DSA
@@ -284,9 +293,28 @@ class TransformerConfig(ModelParallelConfig):
     """Whether to use sparse DSA indexer loss. If True, the indexer loss will be computed using the
     top-k indices."""
 
+    ####################
+    # DeepSeek-v4 hybrid attention
+    ####################
+    csa_window_size: int = 128
+    """Sliding window size for compressed sparse attention."""
+
+    csa_compress_ratios: Optional[List[int]] = None
+    """Per-layer compress ratios, e.g. [0, 0, 4, 128, 4, 128, ...]."""
+
+    csa_compress_rotary_base: float = 40000.0
+    """RoPE base for compressed KV positions in compressed sparse attention."""
+
+    csa_dense_mode: bool = False
+    """Whether to use dense mode for compressed sparse attention. If True, the CSA indexer will be
+    disabled."""
+
     ####################
     # linear attention
     ####################
+    linear_attention_type: Optional[str] = None
+    """Type of linear attention to use.
+    Deprecated. Use experimental_attention_variant instead."""
     linear_attention_freq: Optional[Union[int, List[int]]] = None
     """Frequency between LA (linear attention) layers 
     and SDPA (scaled dot-product attention) layers.
@@ -446,6 +474,10 @@ class TransformerConfig(ModelParallelConfig):
     fused_single_qkv_rope: bool = False
     """If set, avoid splitting QKV before ROPE forward and avoid concatenating ROPE dgrads."""
 
+    use_transformer_engine_op_fuser: bool = False
+    """If True, submodules may use Transformer Engine's operation fuser
+    API to enable advanced fusions."""
+
     fused_residual_rmsnorm: bool = False
     """If True, fuses residual connection and RMSNorm backward pass when TE is used."""
 
@@ -482,7 +514,8 @@ class TransformerConfig(ModelParallelConfig):
 
     recompute_modules: Optional[List[str]] = None
     """The submodules to recompute.
-    choices: "core_attn", "moe_act", "layernorm", "mla_up_proj", "mlp", "moe", "shared_experts".
+    choices: "core_attn", "moe_act", "layernorm", "mla_up_proj", "mlp", "moe",
+             "shared_experts", "mhc".
     default: ["core_attn"].
     "core_attn": recompute the core attention part of the transformer layer.
     "moe_act": recompute the MoE MLP activation function.
@@ -491,7 +524,10 @@ class TransformerConfig(ModelParallelConfig):
     "mlp": recompute the dense MLP submodule.
     "moe": recompute the MoE layer.
     "shared_experts": recompute the shared experts in the MoE layer.
-    "moe_act", "layernorm", and "mla_up_proj" use output-discarding checkpointing,
+    "mhc": recompute HyperConnection intermediate activations via
+            CheckpointWithoutOutput + CheckpointManager. Requires
+            enable_hyper_connections=True. Cannot be used with "mlp".
+    "moe_act", "layernorm", "mla_up_proj", and "mhc" use output-discarding checkpointing,
     "core_attn", "mlp", "moe", and "shared_experts" use normal checkpointing.
     """
 
@@ -600,6 +636,10 @@ class TransformerConfig(ModelParallelConfig):
     """Python import path to a callable quantizer factory, e.g., package.module.quantizer_factory.
     Required when fp4_recipe is custom."""
 
+    high_priority_a2a_comm_stream: bool = False
+    """If True, the communication stream created by set_streams for combined 1f1b
+    a2a overlap is created with CUDA high priority."""
+
     ####################
     # MoE related
     ####################
@@ -616,7 +656,7 @@ class TransformerConfig(ModelParallelConfig):
     in the hidden_states gradient."""
 
     moe_shared_expert_gate: bool = False
-    """Enable gate for shared expert. Only effective when 
+    """Enable gate for shared expert. Only effective when
     moe-shared-expert-intermediate-size is set."""
 
     moe_shared_expert_overlap: bool = False
@@ -725,12 +765,43 @@ class TransformerConfig(ModelParallelConfig):
     If negative, generates bias once per layer and reuses it (abs value is std).
     This is an experimental feature for benchmarking purposes."""
 
+    moe_n_hash_layers: int = 0
+    """Number of leading transformer layers that use hash-based MoE routing.
+    Layers with layer_number <= moe_n_hash_layers use a pre-computed tid2eid
+    lookup table for expert selection instead of learned top-k routing."""
+
+    actual_vocab_size: Optional[int] = None
+    """Padded actual vocabulary size. Required when moe_n_hash_layers > 0 for the
+    tid2eid lookup buffer in hash-based MoE routing."""
+
+    dense_grouped_gemm: bool = False
+    """Use GroupedLinear(num_groups=1) for dense MLP to trigger the
+    ForwardGroupedMLP_CuTeGEMMSwiGLU_MXFP8 fusion on SM100+ with MXFP8 recipe.
+    Requires ``use_te_op_fuser=True`` and SwiGLU activation.
+    """
+
+    log_moe_overload_factor: bool = False
+    """When True, log MoE overload metrics (avg/max vs balanced token count per step; max cum
+    overload = peak cumulative actual tokens / peak cumulative balanced count over interleaved
+    fwd/bwd) to TensorBoard/W&B and console. Records tokens_per_expert.sum() after dispatch;
+    use for debugging."""
+
     moe_grouped_gemm: bool = False
     """When there are multiple experts per rank, compress multiple local (potentially small) gemms
     in a single kernel launch to improve the utilization and performance by leveraging the Grouped
     GEMM feature introduced since CUTLASS 2.8 (https://github.com/fanshiqing/grouped_gemm).
     """
 
+    moe_single_grouped_weight: bool = False
+    """When using TE GroupedLinear for MoE experts, store expert weights as a single grouped
+    parameter via Transformer Engine's `GroupedTensor`. Requires ``moe_grouped_gemm=True``.
+    """
+
+    moe_single_grouped_bias: bool = False
+    """When using TE GroupedLinear for MoE experts, store expert biases as a single grouped
+    parameter via Transformer Engine's `GroupedTensor`. Requires ``moe_grouped_gemm=True``
+    and ``add_bias_linear=True``."""
+
     moe_aux_loss_coeff: Union[float, List[float]] = 0.0
     """Scaling coefficient for the aux loss. A starting value of 1e-2 is recommended.
     If a list of load balancing types is provided for `moe_router_load_balancing_type`,
@@ -759,6 +830,9 @@ class TransformerConfig(ModelParallelConfig):
     Options are "deepep" and "hybridep". Currently only "hybridep" backend supports 
     the MNNVL case."""
 
+    moe_permute_fusion_into_hybridep: bool = False
+    """Fuse token rearrangement ops during token dispatching for HybridEP."""
+
     moe_per_layer_logging: bool = False
     """Enable per-layer logging for MoE, currently supports auxiliary loss and z loss."""
 
@@ -803,9 +877,35 @@ class TransformerConfig(ModelParallelConfig):
     moe_deepep_num_sms: int = 20
     """Number of SMs to use for DeepEP."""
 
-    moe_hybridep_num_sms: int = 16
-    """Number of SMs to use for HybridEP. In pure NVL scenarios,
-    16 SMs can generally achieve good bandwidth."""
+    moe_hybridep_num_sms: Optional[int] = None
+    """Number of SMs to use for HybridEP. None uses the default from DeepEP.
+    In pure NVL scenarios, 16 SMs can generally achieve good bandwidth."""
+
+    moe_hybridep_num_blocks_permute: Optional[int] = None
+    """Number of cuda threads blocks to use for permute part in HybridEP. 
+    If permute_fusion_into_hybridep is True, this is the number of sms 
+    to use for the permute part."""
+
+    moe_hybridep_num_blocks_unpermute: Optional[int] = None
+    """Number of cuda threads blocks to use for unpermute part in HybridEP. 
+    If permute_fusion_into_hybridep is True, this is the number of sms to 
+    use for the unpermute part."""
+
+    moe_hybridep_num_sms_preprocessing: int = 108
+    """Number of SMs to use for HybridEP preprocessing (metadata scan kernel)."""
+
+    moe_mlp_glu_interleave_size: Optional[int] = None
+    """When set, GLU activations in the MoE grouped MLP layer will use a
+    block interleaved format. Instead of interpreting the input tensor
+    as a concatenation of gates and linear units, it will be
+    interpreted as alternating blocks of gates and linear units.
+    This data format is experimental and primarily intended to enable
+    advanced fused kernels."""
+
+    moe_expert_rank_capacity_factor: Optional[float] = None
+    """moe_expert_rank_capacity_factor (float): The capacity factor for each expert rank. Tokens 
+    exceeding this budget will be dropped. None means no token will be dropped. 
+    The default is None."""
 
     ##################
     # Context Parallel
@@ -871,6 +971,45 @@ class TransformerConfig(ModelParallelConfig):
     When cuda_graph_impl is set to "local", "full_iteration" can be specified as cuda_graph_scope
     to enable whole iteration CUDA graph. All other values enable layerwise CUDA graph."""
 
+    ####################
+    # Hyper-Connection Configuration
+    ####################
+    enable_hyper_connections: bool = False
+    """Enable mHC residual connections."""
+
+    num_residual_streams: int = 4
+    """Number of residual streams (n in paper)."""
+
+    mhc_sinkhorn_iterations: int = 20
+    """Number of Sinkhorn-Knopp iterations for doubly stochastic projection."""
+
+    mhc_init_gating_factor: float = 0.01
+    """Initial value of Gating Factor (alpha in paper)."""
+
+    use_fused_mhc: bool = False
+    """Use cuTile fused kernels for mHC operations.
+
+    When True, attempts to replace the reference mHC modules (SinkhornKnopp,
+    H_aggregate, H_post_bda, ProjRms) with fused cuda.tile (cuTile) autograd
+    functions for better performance on supported GPUs.  Requires cuTile to be
+    installed; if cuTile is unavailable the flag is silently reset to False and
+    a warning is emitted.
+    """
+
+    mhc_recompute_layer_num: Optional[int] = None
+    """Number of layers per MHC recompute block.
+    
+    When set, every `mhc_recompute_layer_num` layers form a recompute block. The last layer
+    in each recompute block (i.e., layer_number % mhc_recompute_layer_num == 0 or the final
+    layer in the transformer block) will:
+    - NOT checkpoint its final MLP BDA
+    - Register the unified recompute hook on its MLP BDA output
+    - A new CheckpointManager is created for subsequent layers
+    
+    If None, all layers in the transformer block share a single recompute block.
+
+    Must be a positive integer when set."""
+
     ####################
     # miscellaneous
     ####################
@@ -988,6 +1127,9 @@ class TransformerConfig(ModelParallelConfig):
     """Transformer implementation to use.
     Options are 'transformer_engine' for Transformer Engine and 'local' for MCore."""
 
+    fallback_to_eager_attn: bool = False
+    """Whether to fallback to eager attention in TE implementation.
+    Suggested for when desired features are not available in TE implementation."""
     #####################################
     # Fine-grained Activation Offloading
     #####################################
@@ -1011,6 +1153,39 @@ class TransformerConfig(ModelParallelConfig):
     min_offloaded_tensor_size: int = 1024 * 1024
     """The minimum size of the tensor to be offloaded."""
 
+    delay_offload_until_cuda_graph: bool = False
+    """If True, delay the offload until the CUDA graph is executed for minimal CPU overhead.
+    For more details, see the documentation:
+    https://github.com/NVIDIA/Megatron-LM/blob/main/docs/user-guide/features/fine_grained_activation_offloading.md#cuda-graph-integration.
+    """
+
+    delta_offload_bytes_across_pp_ranks: int = 0
+    """Difference of offload bytes across PP ranks to balance the offload load.
+    For more details, see the documentation:
+    https://github.com/NVIDIA/Megatron-LM/blob/main/docs/user-guide/features/fine_grained_activation_offloading.md#tuning-parameters.
+    """
+
+    activation_offload_fraction: float = 1.0
+    """The fraction of the activation to be offloaded, which should be in range [0, 1]."""
+
+    moe_paged_stash: bool = False
+    """If True, enable paged stash for all routed-expert activations needed for backward"""
+
+    moe_paged_stash_page_size: int = 64
+    """Number of tokens per page for paged stash memory management."""
+
+    moe_paged_stash_buffer_size_factor_cuda: float = 1.10
+    """Scale factor for paged stash CUDA buffer allocation.
+
+    Sign selects sizing: positive = avg-based, negative = actual-max. Magnitude is headroom
+    (e.g. 1.10 = 10%)."""
+
+    moe_paged_stash_buffer_size_factor_cpu: float = 0.0
+    """Scale factor for paged stash host buffer. 0 disables host buffer.
+
+    Same sign convention as moe_paged_stash_buffer_size_factor_cuda: positive = avg-based,
+    negative = actual-max; scale = abs(factor)."""
+
     def __post_init__(self):
         """Python dataclass method that is used to modify attributes after initialization.
         See https://docs.python.org/3/library/dataclasses.html#post-init-processing for more
@@ -1064,31 +1239,40 @@ def __post_init__(self):
                 f"tensor_model_parallel_size ({self.tensor_model_parallel_size})."
             )
 
-        if self.experimental_attention_variant == "gated_delta_net":
+        if self.linear_attention_type is not None:
+            warnings.warn(
+                "linear_attention_type is deprecated, "
+                "use experimental_attention_variant instead."
+            )
+            self.experimental_attention_variant = self.linear_attention_type
+            self.linear_attention_type = None
+
+        if self.experimental_attention_variant in ["gated_delta_net"]:
             assert (
                 self.linear_attention_freq is not None
-            ), f"linear_attention_freq must be set for linear gated_delta_net."
+            ), f"linear_attention_freq must be set for linear attention."
 
-            # Check required parameters
-            assert (
-                self.linear_conv_kernel_dim is not None
-            ), "linear_conv_kernel_dim must be set for gated delta net."
-            assert (
-                self.linear_key_head_dim is not None
-            ), "linear_key_head_dim must be set for gated delta net."
-            assert (
-                self.linear_value_head_dim is not None
-            ), "linear_value_head_dim must be set for gated delta net."
-            assert (
-                self.linear_num_key_heads is not None
-            ), "linear_num_key_heads must be set for gated delta net."
-            assert (
-                self.linear_num_value_heads is not None
-            ), "linear_num_value_heads must be set for gated delta net."
-            assert self.linear_num_value_heads % self.linear_num_key_heads == 0, (
-                f"linear_num_value_heads ({self.linear_num_value_heads}) must be a multiple of "
-                f"linear_num_key_heads ({self.linear_num_key_heads})."
-            )
+            if self.experimental_attention_variant == "gated_delta_net":
+                # Check required parameters
+                assert (
+                    self.linear_conv_kernel_dim is not None
+                ), "linear_conv_kernel_dim must be set for gated delta net."
+                assert (
+                    self.linear_key_head_dim is not None
+                ), "linear_key_head_dim must be set for gated delta net."
+                assert (
+                    self.linear_value_head_dim is not None
+                ), "linear_value_head_dim must be set for gated delta net."
+                assert (
+                    self.linear_num_key_heads is not None
+                ), "linear_num_key_heads must be set for gated delta net."
+                assert (
+                    self.linear_num_value_heads is not None
+                ), "linear_num_value_heads must be set for gated delta net."
+                assert self.linear_num_value_heads % self.linear_num_key_heads == 0, (
+                    f"linear_num_value_heads ({self.linear_num_value_heads}) must be a multiple of "
+                    f"linear_num_key_heads ({self.linear_num_key_heads})."
+                )
 
             # Check tensor parallelism compatibility
             tp_cp_size = self.tensor_model_parallel_size * self.context_parallel_size
@@ -1102,6 +1286,22 @@ def __post_init__(self):
             )
         elif self.experimental_attention_variant == "dsa":
             pass
+        elif self.experimental_attention_variant == "dsv4_hybrid":
+            assert self.multi_latent_attention, "DSv4 Hybrid requires multi_latent_attention."
+            assert self.csa_compress_ratios is not None, "csa_compress_ratios must be set"
+            mtp_layers = self.mtp_num_layers or 0
+            expected_len = self.num_layers + mtp_layers
+            assert len(self.csa_compress_ratios) == expected_len, (
+                f"csa_compress_ratios length ({len(self.csa_compress_ratios)}) must equal "
+                f"num_layers + mtp_num_layers ({self.num_layers} + {mtp_layers} = {expected_len})"
+            )
+            assert all(
+                ratio in [0, 4, 128] for ratio in self.csa_compress_ratios
+            ), "csa_compress_ratios must be 0, 4, or 128"
+            assert (
+                self.tensor_model_parallel_size == 1
+            ), "DSv4 Hybrid Attention only supports TP size 1."
+            assert not self.qk_clip, "QK clipping is not supported with DSv4 Hybrid Attention."
 
         if self.fp8:
             # cannot support first last layer bf16 with delayed scaling
@@ -1239,6 +1439,20 @@ def __post_init__(self):
                     "Please set num_moe_experts or remove moe_ffn_hidden_size."
                 )
 
+        if self.moe_single_grouped_weight or self.moe_single_grouped_bias:
+            if not self.moe_grouped_gemm:
+                raise ValueError(
+                    "moe_single_grouped_weight and moe_single_grouped_bias require "
+                    "moe_grouped_gemm=True."
+                )
+            if not is_te_min_version("2.14.0"):
+                raise ValueError(
+                    "moe_single_grouped_weight and moe_single_grouped_bias require "
+                    f"transformer-engine>=2.14.0, but your version is {get_te_version()}."
+                )
+        if self.moe_single_grouped_bias and not self.add_bias_linear:
+            raise ValueError("moe_single_grouped_bias requires add_bias_linear=True.")
+
         if self.moe_enable_deepep:
             if self.moe_token_dispatcher_type != "flex":
                 raise ValueError("DeepEP backend is only supported with flex token dispatcher.")
@@ -1314,6 +1528,18 @@ def __post_init__(self):
                     "moe_expert_capacity_factor must be set to use moe_pad_expert_input_to_capacity"
                 )
 
+        if self.moe_expert_rank_capacity_factor is not None:
+            if not self.use_transformer_engine_op_fuser:
+                raise ValueError(
+                    "moe_expert_rank_capacity_factor requires use_transformer_engine_op_fuser to "
+                    "be enabled."
+                )
+            if self.moe_flex_dispatcher_backend != "hybridep":
+                raise ValueError(
+                    "moe_expert_rank_capacity_factor requires moe_flex_dispatcher_backend to be "
+                    "'hybridep'."
+                )
+
         if self.cpu_offloading and (
             self.cpu_offloading_num_layers < 0 or self.cpu_offloading_num_layers >= self.num_layers
         ):
@@ -1383,6 +1609,7 @@ def __post_init__(self):
                     "mlp",
                     "moe",
                     "shared_experts",
+                    "mhc",
                 }
                 invalid_modules = set(self.recompute_modules) - allowed_modules
                 assert not invalid_modules, (
@@ -1445,6 +1672,77 @@ def __post_init__(self):
             if "moe" not in self.recompute_modules:
                 self.recompute_modules.append("moe")
 
+        # Validation for "mhc" in recompute_modules
+        if self.recompute_granularity == "selective" and "mhc" in self.recompute_modules:
+            if not self.enable_hyper_connections:
+                raise ValueError(
+                    "'mhc' in recompute_modules requires enable_hyper_connections=True."
+                )
+            if "mlp" in self.recompute_modules:
+                raise ValueError(
+                    "'mhc' and 'mlp' in recompute_modules cannot be used together. "
+                    "They use different checkpoint mechanisms that may conflict."
+                )
+            if self.mhc_recompute_layer_num is not None and (
+                isinstance(self.mhc_recompute_layer_num, bool)
+                or not isinstance(self.mhc_recompute_layer_num, int)
+                or self.mhc_recompute_layer_num < 1
+            ):
+                raise ValueError(
+                    "mhc_recompute_layer_num must be a positive integer when "
+                    "'mhc' is in recompute_modules."
+                )
+            if self.fine_grained_activation_offloading and self.offload_modules:
+                # mHC checkpoints wrap input_layernorm (inside attn_norm offload context)
+                # and pre_mlp_layernorm (inside mlp_norm offload context). The unified
+                # recompute hook fires before GroupCommitFunction.backward() initializes
+                # the backward chunk, so tensor_pop hits a None chunk for these modules.
+                # Other offload modules (qkv_linear, core_attn, attn_proj, expert_fc1,
+                # moe_act) live inside self_attention/MLP which are NOT wrapped by mHC
+                # checkpoints, so they are safe to use with mHC recompute.
+                _MHC_CONFLICTING_OFFLOAD_MODULES = {"attn_norm", "mlp_norm"}
+                conflicting = _MHC_CONFLICTING_OFFLOAD_MODULES & set(self.offload_modules)
+                if conflicting:
+                    raise ValueError(
+                        f"'mhc' in recompute_modules is incompatible with "
+                        f"offload_modules {conflicting}. The mHC recompute hook fires "
+                        f"before the offloading backward chunk is initialized for these "
+                        f"modules, causing tensor_pop on a None chunk. Remove "
+                        f"{conflicting} from offload_modules or remove 'mhc' from "
+                        f"recompute_modules."
+                    )
+
+        if self.enable_hyper_connections and not (
+            self.recompute_granularity == "selective" and "mhc" in self.recompute_modules
+        ):
+            warnings.warn(
+                "HyperConnections are enabled but 'mhc' is not in "
+                "recompute_modules with selective recompute. Consider adding 'mhc' to "
+                "recompute_modules with selective recompute to reduce activation memory."
+            )
+
+        # Validation for use_fused_mhc
+        if self.use_fused_mhc:
+            if not self.enable_hyper_connections:
+                raise ValueError("use_fused_mhc requires enable_hyper_connections=True.")
+            try:
+                from megatron.core.fusions.fused_mhc_kernels import is_cutile_available
+
+                if not is_cutile_available():
+                    warnings.warn(
+                        "use_fused_mhc is enabled but cuda.tile (cuTile) is not installed. "
+                        "Falling back to reference mHC implementations.",
+                        UserWarning,
+                    )
+                    self.use_fused_mhc = False
+            except ImportError:
+                warnings.warn(
+                    "use_fused_mhc is enabled but fused_mhc_kernels module could not be "
+                    "imported. Falling back to reference mHC implementations.",
+                    UserWarning,
+                )
+                self.use_fused_mhc = False
+
         if self.fine_grained_activation_offloading:
             assert (
                 not self.cpu_offloading
@@ -1470,6 +1768,36 @@ def __post_init__(self):
                     "because the input of attn_proj is the output of core_attn, "
                     "which is needed in core_attn.backward()."
                 )
+            if self.recompute_granularity == "selective" and "moe" in self.recompute_modules:
+                offload_inside_moe = {"moe_act", "expert_fc1"} & set(self.offload_modules)
+                assert not offload_inside_moe, (
+                    f"Cannot offload {offload_inside_moe} while recomputing the entire MoE layer. "
+                    f"'moe' in recompute_modules wraps the full MoE forward in a checkpoint, "
+                    f"so offloading activations inside it is redundant and will cause errors. "
+                    f"Either remove 'moe' from --recompute-modules or remove "
+                    f"{offload_inside_moe} from --offload-modules."
+                )
+            assert (
+                self.min_offloaded_tensor_size >= 0
+            ), "min_offloaded_tensor_size must be non-negative."
+            assert (
+                self.activation_offload_fraction >= 0 and self.activation_offload_fraction <= 1
+            ), "activation_offload_fraction must be in range [0, 1]."
+            assert (
+                self.delta_offload_bytes_across_pp_ranks >= 0
+            ), "delta_offload_bytes_across_pp_ranks must be non-negative."
+        if self.moe_paged_stash:
+            assert not self.cpu_offloading, "moe_paged_stash cannot be enabled with cpu_offloading."
+            assert self.moe_expert_rank_capacity_factor is not None, (
+                "moe_paged_stash requires moe_expert_rank_capacity_factor to be set; "
+                "there is no need to use paged stashing without it."
+            )
+            moe_offload_conflict = {"expert_fc1", "moe_act"} & set(self.offload_modules)
+            assert not moe_offload_conflict, (
+                "When moe_paged_stash is enabled, offload_modules must not include "
+                f"expert_fc1 or moe_act (paged stash covers those activations). "
+                f"Remove: {moe_offload_conflict}"
+            )
 
         if (
             self.num_layers_in_first_pipeline_stage is not None
@@ -1713,6 +2041,24 @@ def __post_init__(self):
             if self.activation_func != F.silu or not self.gated_linear_unit:
                 raise ValueError("Storing activation input in FP8 is supported only for SwiGLU.")
 
+        if self.activation_func_clamp_value is not None:
+            # swiglu
+            if self.activation_func == F.silu and self.gated_linear_unit:
+                if self.num_moe_experts is None:
+                    raise ValueError(
+                        "activation_func_clamp_value for SwiGLU is only supported with MoE."
+                    )
+                if self.use_te_activation_func:
+                    raise ValueError(
+                        "use_te_activation_func must be False "
+                        "when activation_func_clamp_value is not None for SwiGLU"
+                    )
+                if self.use_transformer_engine_op_fuser:
+                    raise ValueError(
+                        "use_transformer_engine_op_fuser must be False "
+                        "when activation_func_clamp_value is not None for SwiGLU"
+                    )
+
         if self.apply_rope_fusion:
             if self.multi_latent_attention:
                 warnings.warn(
@@ -1842,6 +2188,38 @@ def __post_init__(self):
                 "'sqrtsoftplus', or unset --moe-router-enable-expert-bias."
             )
 
+        if self.moe_n_hash_layers > 0:
+            assert (
+                self.actual_vocab_size is not None
+            ), "actual_vocab_size must be set when moe_n_hash_layers > 0."
+            if self.pipeline_model_parallel_size > 1:
+                assert self.pipeline_model_parallel_layout is not None, (
+                    "pipeline_model_parallel_layout must be set when using hash MoE "
+                    "layers with pipeline parallelism (PP > 1)."
+                )
+                # The embedding is always in layout[0][0] (PP rank 0, VPP rank 0).
+                # All hash MoE layers must be in the same virtual pipeline stage.
+                embedding_stage = self.pipeline_model_parallel_layout.layout[0][0]
+                n_decoders_with_embedding = embedding_stage.count(LayerType.decoder)
+                assert self.moe_n_hash_layers <= n_decoders_with_embedding, (
+                    f"Currently, All hash MoE layers must be in the same virtual pipeline stage "
+                    f"as the embedding. The embedding stage has "
+                    f"{n_decoders_with_embedding} decoder layers, but "
+                    f"moe_n_hash_layers={self.moe_n_hash_layers}."
+                )
+            assert (
+                not self.overlap_moe_expert_parallel_comm
+            ), "overlap_moe_expert_parallel_comm does not support moe_n_hash_layers > 0 for now."
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                f"Hash MoE layer initialized with placeholder round-robin tid2eid. "
+                f"For real training, you MUST either (a) load tid2eid from a "
+                f"pre-trained DSv4 checkpoint, or (b) provide a frequency-aware "
+                f"initialization (e.g., Sinkhorn-balanced over token frequency). "
+                f"Round-robin will cause severe expert imbalance.",
+            )
+
         if self.num_moe_experts and self.fp8:
             # TE version below 1.7.0 will raise Error when handle zeros tokens for expert
             if not is_te_min_version("1.7.0.dev0"):
@@ -2070,6 +2448,22 @@ def __post_init__(self):
                             "moe_input_jitter_eps is not supported with graphed moe recomputation."
                         )
 
+            if self.fine_grained_activation_offloading:
+                assert self.cuda_graph_impl == "transformer_engine" or (
+                    self.cuda_graph_impl == "local" and self.cuda_graph_scope == "full_iteration"
+                ), (
+                    "fine-grained activation offloading is only supported with "
+                    "transformer_engine CUDA graph implementation or local CUDA graph "
+                    "implementation with full_iteration scope."
+                )
+                assert (
+                    CudaGraphScope.moe not in self.cuda_graph_scope
+                ), "Token-drop MoE is temporarily not supported with activation offloading."
+                assert self.cuda_graph_warmup_steps > 0, (
+                    "cuda_graph_warmup_steps must be greater than 0 when enabling "
+                    "fine-grained activation offloading."
+                )
+
         if self.moe_token_dispatcher_type in ["allgather"]:
             if self.variable_seq_lengths is True:
                 raise ValueError(
@@ -2145,14 +2539,15 @@ def __post_init__(self):
                 )
 
             if self.cuda_graph_impl != "none":
-                assert (
-                    self.cuda_graph_impl == "transformer_engine"
-                    and CudaGraphScope.moe not in self.cuda_graph_scope
-                    and CudaGraphScope.mlp not in self.cuda_graph_scope
-                ), (
-                    'CUDA graph scope on moe and mlp is not '
-                    'supported with overlap_moe_expert_parallel_comm'
-                )
+                if self.cuda_graph_impl == "transformer_engine":
+                    assert (
+                        self.cuda_graph_impl == "transformer_engine"
+                        and CudaGraphScope.moe not in self.cuda_graph_scope
+                        and CudaGraphScope.mlp not in self.cuda_graph_scope
+                    ), (
+                        'CUDA graph scope on moe and mlp is not '
+                        'supported with overlap_moe_expert_parallel_comm'
+                    )
 
         # Check delay_wgrad_compute compatibility
         if self.delay_wgrad_compute:
@@ -2219,6 +2614,12 @@ def __post_init__(self):
                 "2.3.0.dev0+39c0e70"
             ), "Must have at least TE version 2.3 or higher to use symmetric memory all reduce"
 
+        if self.rotary_base_per_layer is not None:
+            assert len(self.rotary_base_per_layer) == self.num_layers, (
+                f"rotary_base_per_layer length ({len(self.rotary_base_per_layer)}) "
+                f"must equal num_layers ({self.num_layers})"
+            )
+
         if self.no_rope_freq:
             assert not self.flash_decode, "flash_decode cannot be used with no_rope."
             if isinstance(self.no_rope_freq, int):
@@ -2236,6 +2637,25 @@ def __post_init__(self):
                     f"the number of layers ({self.num_layers})"
                 )
 
+        if self.fallback_to_eager_attn:
+            assert self.transformer_impl == "transformer_engine", (
+                f"fallback_to_eager_attn is only available with transformer_engine implementation,"
+                f" but got {self.transformer_impl=}."
+            )
+
+        if self.fallback_to_eager_attn or self.transformer_impl == "local":
+            if self.context_parallel_size > 1 and self.cp_comm_type is not None:
+                all_cp_comm_types_are_all_gather = (
+                    all(item == "all_gather" for item in self.cp_comm_type)
+                    if isinstance(self.cp_comm_type, list)
+                    else self.cp_comm_type == "all_gather"
+                )
+                if not all_cp_comm_types_are_all_gather:
+                    raise ValueError(
+                        f"fallback_to_eager_attn only supports all_gather communication type "
+                        f"for context parallelism, but got {self.cp_comm_type=} instead."
+                    )
+
         if self.transformer_impl == "inference_optimized":
             assert self.normalization == "RMSNorm"
             assert not self.layernorm_zero_centered_gamma
@@ -2243,12 +2663,6 @@ def __post_init__(self):
             assert not self.add_qkv_bias
             assert not self.use_kitchen
 
-        if self.experimental_attention_variant == "dsa":
-            assert (
-                self.context_parallel_size == 1
-            ), "Currently context parallelism is not supported by DSAttention!"
-            assert not self.apply_rope_fusion, "RoPE fusion is not supported for DSAttention"
-
         if self.inference_fuse_tp_communication:
             assert self.transformer_impl == "inference_optimized", (
                 "inference_fuse_tp_communication is only supported "
@@ -2269,8 +2683,43 @@ def __post_init__(self):
                 self.attention_backend == AttnBackend.flash
             ), "Batch invariant mode only supports FlashAttention"
 
+        if self.sequence_packing_scheduler is not None:
+            # Check TE version.
+            if not HAVE_PACKAGING:
+                raise ImportError(
+                    "packaging is not installed. Please install it with `pip install packaging`."
+                )
+            # TODO: remove this after we fix the convergence issue with TE < 2.9.
+            if not (
+                is_te_min_version("2.9.0") or get_te_version() == PkgVersion("2.9.0.dev0+5b3092a")
+            ):
+                raise ValueError(
+                    "SFT sequence packing requires Transformer Engine >= 2.9.0 "
+                    f"but got {get_te_version()} (TE < 2.9.0 may have convergence issues)."
+                )
+
+            # Needed for passing variable sequences between pp stages.
+            self.variable_seq_lengths = True
+
+            # TODO(tailaim): add support for other dispatcher types
+            assert self.moe_token_dispatcher_type == "alltoall", (
+                f"sequence_packing only supports moe_token_dispatcher_type='alltoall', "
+                f"got '{self.moe_token_dispatcher_type}'"
+            )
+
+            supported_schedulers = ['dp_balanced', 'default_dynamic_cp']
+            if (
+                self.sequence_packing_scheduler is not None
+                and self.sequence_packing_scheduler not in supported_schedulers
+            ):
+                raise ValueError(
+                    f"Unsupported scheduler: {self.sequence_packing_scheduler}. "
+                    f"Available schedulers: {supported_schedulers}"
+                )
+
 
 @dataclass
+@experimental_api
 class MLATransformerConfig(TransformerConfig):
     """Configuration object for megatron-core Multi-Latent Attention (MLA) transformers.
 
@@ -2285,10 +2734,12 @@ class MLATransformerConfig(TransformerConfig):
     """Rank of Query tensor's low rank representation."""
 
     kv_lora_rank: int = 512
-    """Rank of Key and Value tensors' low rank representation."""
+    """Rank of Key and Value tensors' low rank representation.
+       This is not used for DSv4 Hybrid Attention and will be overridden automatically."""
 
     qk_head_dim: int = 128
-    """Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim"""
+    """Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim
+       This is not used for DSv4 Hybrid Attention and will be overridden automatically."""
 
     qk_pos_emb_head_dim: int = 64
     """Dimension of the position embedding in the QK projection."""
@@ -2326,6 +2777,12 @@ class MLATransformerConfig(TransformerConfig):
     mscale_all_dim: float = 0.0
     """Mscale all dimensions for YaRN RoPE in Multi-Latent Attention, used by yarn."""
 
+    o_groups: int = 8
+    """Number of groups for grouped low-rank output projection (wo_a)."""
+
+    o_lora_rank: int = 1024
+    """Low-rank dimension per group for grouped output (wo_a). Used when o_groups > 0."""
+
     cache_mla_latents: bool = False
     """Cache the low dimensional tensors for MLA rather than full KV cache.
        This is only for the dynamic inference backend and requires that 
@@ -2344,6 +2801,21 @@ def __post_init__(self):
         if self.attention_output_gate:
             raise NotImplementedError("Output gate is not supported for MLA yet.")
 
+        # DSv4 hybrid: derive qk_head_dim and kv_lora_rank from v_head_dim and qk_pos_emb_head_dim
+        if self.experimental_attention_variant == "dsv4_hybrid":
+            assert (
+                not self.mla_down_proj_fusion
+            ), "MLA down projection fusion must be disabled for DSv4 hybrid mode."
+            log_single_rank(
+                logger,
+                logging.WARNING,
+                f"DSv4 hybrid mode is enabled, deriving qk_head_dim and kv_lora_rank from "
+                f"v_head_dim and qk_pos_emb_head_dim",
+            )
+            derived = self.v_head_dim - self.qk_pos_emb_head_dim
+            self.qk_head_dim = derived
+            self.kv_lora_rank = derived
+
         if self.cache_mla_latents:
             assert (
                 self.apply_rope_fusion is False
diff --git a/megatron/core/transformer/transformer_layer.py b/megatron/core/transformer/transformer_layer.py
index cf63199347c..271744b57a3 100644
--- a/megatron/core/transformer/transformer_layer.py
+++ b/megatron/core/transformer/transformer_layer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 from __future__ import annotations
 
 import functools
@@ -8,6 +8,9 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Any, Dict, Optional, Union
 
+if TYPE_CHECKING:
+    from megatron.core.tensor_parallel.random import CheckpointManager
+
 import torch
 import torch.distributed
 from torch import Tensor
@@ -30,6 +33,7 @@
     deprecate_inference_params,
     get_pg_rank,
     is_te_min_version,
+    is_torch_min_version,
     log_single_rank,
     make_viewless_tensor,
     nvtx_range_pop,
@@ -42,6 +46,16 @@
 logger = logging.getLogger(__name__)
 
 
+@functools.lru_cache(maxsize=None)
+def _get_offloading_interface():
+    """Get the offloading interface for fine-grained activation offloading."""
+    from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+        FineGrainedActivationOffloadingInterface,
+    )
+
+    return FineGrainedActivationOffloadingInterface
+
+
 def get_transformer_layer_offset(
     config: TransformerConfig, vp_stage: Optional[int] = None, pp_rank: Optional[int] = None
 ):
@@ -228,14 +242,17 @@ class TransformerLayerSubmodules:
     """
 
     input_layernorm: LayerNormBuilder = IdentityOp
+    self_attention_hyper_connection: Union[ModuleSpec, type] = IdentityOp
     self_attention: Union[ModuleSpec, type] = IdentityOp
     self_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp
 
     pre_cross_attn_layernorm: LayerNormBuilder = IdentityOp
+    cross_attention_hyper_connection: Union[ModuleSpec, type] = IdentityOp
     cross_attention: Union[ModuleSpec, type] = IdentityOp
     cross_attn_bda: Union[ModuleSpec, type] = IdentityFuncOp
 
     pre_mlp_layernorm: LayerNormBuilder = IdentityOp
+    mlp_hyper_connection: Union[ModuleSpec, type] = IdentityOp
     mlp: Union[ModuleSpec, type] = IdentityOp
     mlp_bda: Union[ModuleSpec, type] = IdentityFuncOp
 
@@ -323,6 +340,8 @@ def __init__(
         attention_optional_kwargs["pg_collection"] = pg_collection
         if pp_layer_offset is not None:
             attention_optional_kwargs["pp_layer_offset"] = pp_layer_offset
+        if is_mtp_layer:
+            attention_optional_kwargs["is_mtp_layer"] = True
 
         # [Module 2: SelfAttention]
         self.self_attention = build_module(
@@ -373,9 +392,11 @@ def __init__(
         if isinstance(submodules.mlp, ModuleSpec):
             if submodules.mlp.module in (MoELayer, TEGroupedMLP, SequentialMLP):
                 additional_mlp_kwargs["pg_collection"] = pg_collection
-                # Pass is_mtp_layer flag to MoELayer to distinguish MTP MoE layers.
                 if submodules.mlp.module == MoELayer:
+                    # Pass is_mtp_layer flag to MoELayer to distinguish MTP MoE layers.
                     additional_mlp_kwargs["is_mtp_layer"] = self.is_mtp_layer
+                    # Pass layer number to MoELayer for router configuration.
+                    additional_mlp_kwargs["layer_number"] = self.layer_number
             elif submodules.mlp.module == MLP:
                 assert hasattr(
                     pg_collection, 'tp'
@@ -393,8 +414,6 @@ def __init__(
                     f"Unknown MLP type: {type(submodules.mlp)}. Using default kwargs.",
                 )
         self.mlp = build_module(submodules.mlp, config=self.config, **additional_mlp_kwargs)
-        if hasattr(self.mlp, 'set_layer_number'):
-            self.mlp.set_layer_number(self.layer_number)
 
         # [Module 9: BiasDropoutFusion]
         self.mlp_bda = build_module(submodules.mlp_bda)
@@ -471,17 +490,10 @@ def can_recompute_pre_mlp_layernorm_for_cudagraph():
             if "mlp" in self.config.recompute_modules:
                 if not self.is_moe_layer:
                     self.recompute_mlp = True
-        self.offload_attn_norm = (
-            self.config.fine_grained_activation_offloading
-            and "attn_norm" in self.config.offload_modules
-            and not isinstance(self.input_layernorm, IdentityOp)
-        )
-        self.offload_mlp_norm = (
-            self.config.fine_grained_activation_offloading
-            and "mlp_norm" in self.config.offload_modules
-            and not isinstance(self.pre_mlp_layernorm, IdentityOp)
-        )
 
+        self._set_offload_modules()
+        self.off_interface = _get_offloading_interface()
+        self.mlp_norm_manager = None
         # @jcasper how should we handle nvfuser?
         # Set bias+dropout+add fusion grad_enable execution handler.
         # TORCH_MAJOR = int(torch.__version__.split('.')[0])
@@ -540,6 +552,7 @@ def _forward_attention(
         packed_seq_params: Optional[PackedSeqParams] = None,
         sequence_len_offset: Optional[Tensor] = None,
         padding_mask: Optional[Tensor] = None,
+        input_ids: Optional[Tensor] = None,
         *,
         inference_params: Optional[Any] = None,
     ):
@@ -570,21 +583,18 @@ def _forward_attention(
                 context (Tensor): Updated context tensor if cross-attention is used,
                 otherwise None.
         """
-        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
-            FineGrainedActivationOffloadingInterface as off_interface,
-        )
-
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
         # Optional Input Layer norm
+        attn_norm_manager = self.off_interface(self.offload_attn_norm, hidden_states, "attn_norm")
         if self.recompute_input_layernorm:
             self.input_layernorm_checkpoint = tensor_parallel.CheckpointWithoutOutput()
-            with off_interface(self.offload_attn_norm, hidden_states, "attn_norm") as hidden_states:
+            with attn_norm_manager as hidden_states:
                 input_layernorm_output = self.input_layernorm_checkpoint.checkpoint(
                     apply_module(self.input_layernorm), hidden_states
                 )
         else:
-            with off_interface(self.offload_attn_norm, hidden_states, "attn_norm") as hidden_states:
+            with attn_norm_manager as hidden_states:
                 input_layernorm_output = apply_module(self.input_layernorm)(hidden_states)
 
         if isinstance(input_layernorm_output, tuple):
@@ -606,8 +616,6 @@ def _forward_attention(
         )
 
         if using_fused_tp_inference_kernel:
-            # Set the residual for fused reduce-scatter + add + layer-norm + all-gather
-            # operation in attention's out_proj (linear_proj)
             self._set_proj_residual(residual)
 
         # Self attention.
@@ -650,10 +658,9 @@ def _forward_attention(
 
         # Delay the offload of the attention norm until after the self_attn_bda has been computed
         # because the residual is needed in the self_attn_bda.
-        if self.offload_attn_norm:
-            hidden_states = off_interface.group_commit(
-                hidden_states, name="attn_norm", forced_released_tensors=[residual]
-            )
+        hidden_states = attn_norm_manager.group_offload(
+            hidden_states, forced_released_tensors=[residual]
+        )
 
         # Optional Layer norm after self-attention
         pre_cross_attn_layernorm_output = apply_module(self.pre_cross_attn_layernorm)(hidden_states)
@@ -700,27 +707,30 @@ def forward(self, *args, **kwargs):
         This method calls the core computation of a transformer layer, including
         self-attention, cross-attention (if applicable), and feed-forward operations.
         """
+        # Injected by __call__ for cuda graph keying; not a real forward arg.
+        kwargs.pop("dynamic_inference_decode_only", None)
+        assert (
+            not self.config.enable_hyper_connections
+        ), "Please use HyperConnectionTransformerLayer instead"
         hidden_states, context = self._forward_attention(*args, **kwargs)
         output = self._forward_mlp(
             hidden_states,
             kwargs.get("inference_context", None),
             padding_mask=kwargs.get("padding_mask", None),
+            input_ids=kwargs.get("input_ids", None),
         )
         return output, context
 
     def _forward_pre_mlp_layernorm(self, hidden_states: Tensor):
-        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
-            FineGrainedActivationOffloadingInterface as off_interface,
-        )
-
+        self.mlp_norm_manager = self.off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm")
         if self.recompute_pre_mlp_layernorm:
             self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput()
-            with off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm") as hidden_states:
+            with self.mlp_norm_manager as hidden_states:
                 pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint(
                     apply_module(self.pre_mlp_layernorm), hidden_states
                 )
         else:
-            with off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm") as hidden_states:
+            with self.mlp_norm_manager as hidden_states:
                 pre_mlp_layernorm_output = apply_module(self.pre_mlp_layernorm)(hidden_states)
 
         return pre_mlp_layernorm_output
@@ -730,6 +740,7 @@ def _forward_mlp(
         hidden_states: Tensor,
         inference_context: BaseInferenceContext | None = None,
         padding_mask: Tensor | None = None,
+        input_ids: Optional[Tensor] = None,
     ) -> Tensor | list[Tensor | None]:
         """
         Perform a forward pass through the feed-forward layer.
@@ -742,6 +753,8 @@ def _forward_mlp(
                 Shape [bsz, seq_length]. True = padding (exclude), False = valid (include).
                 Only used for MoE layers to exclude padding tokens from aux loss computations.
                 The MoELayer will internally transform this to [seq_length, bsz] format.
+            input_ids (Tensor, optional): The input IDs tensor. Shape [seq_length, bsz].
+                Only used for hash-based MoE routing. Defaults to None.
         Returns:
             output (Tensor): Transformed hidden states of shape [s, b, h].
         """
@@ -778,6 +791,10 @@ def _forward_mlp(
             self.config.inference_fuse_tp_communication
         )
 
+        moe_kwargs = {}
+        if self.is_moe_layer and input_ids is not None:
+            moe_kwargs["input_ids"] = input_ids
+
         if self.recompute_mlp:
             if self.config.fp8 or self.config.fp4:
                 # import here to avoid circular import
@@ -790,10 +807,11 @@ def _forward_mlp(
                     self.pg_collection.tp,
                     pre_mlp_layernorm_output,
                     padding_mask=padding_mask,
+                    **moe_kwargs,
                 )
             else:
                 mlp_output_with_bias = tensor_parallel.checkpoint(
-                    functools.partial(self.mlp, padding_mask=padding_mask),
+                    functools.partial(self.mlp, padding_mask=padding_mask, **moe_kwargs),
                     False,
                     pre_mlp_layernorm_output,
                 )
@@ -815,7 +833,9 @@ def _forward_mlp(
                 # Set the residual for fused reduce-scatter + add + layer-norm + all-gather
                 # operation in MLP's fc2.
                 self._set_fc2_residual(residual)
-            mlp_output_with_bias = self.mlp(pre_mlp_layernorm_output, padding_mask=padding_mask)
+            mlp_output_with_bias = self.mlp(
+                pre_mlp_layernorm_output, padding_mask=padding_mask, **moe_kwargs
+            )
 
         nvtx_range_pop(suffix="mlp")
 
@@ -850,9 +870,6 @@ def _forward_post_mlp(
         Returns:
             output (Tensor): Transformed hidden states of shape [s, b, h].
         """
-        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
-            FineGrainedActivationOffloadingInterface as off_interface,
-        )
 
         using_fused_tp_inference_kernel = (not self.training) and (
             self.config.inference_fuse_tp_communication
@@ -881,10 +898,11 @@ def _forward_post_mlp(
         nvtx_range_pop(suffix="mlp_bda")
         # Delay the offload of the mlp norm until after the mlp_bda has been computed
         # because the residual is needed in the mlp_bda.
-        if self.offload_mlp_norm:
-            hidden_states = off_interface.group_commit(
-                hidden_states, name="mlp_norm", forced_released_tensors=[residual]
+        if self.mlp_norm_manager is not None:
+            hidden_states = self.mlp_norm_manager.group_offload(
+                hidden_states, forced_released_tensors=[residual]
             )
+            self.mlp_norm_manager = None
 
         # Jit compiled function creates 'view' tensor. This tensor
         # potentially gets saved in the MPU checkpoint function context,
@@ -1001,6 +1019,19 @@ def get_layer_static_inputs(self, seq_length, micro_batch_size):
                 .reshape(1, 1, slen_per_cp, seq_length)
                 .tile(micro_batch_size, 1, 1, 1)
             )
+
+        # Add input_ids for hash-based MoE routing under CUDA graphs.
+        # Only add for layers that actually use hash routing,
+        # since other layers (e.g. on later PP stages) receive input_ids=None.
+        if (
+            self.is_moe_layer
+            and self.config.moe_n_hash_layers > 0
+            and getattr(self.mlp.router, 'is_hash_layer', False)
+        ):
+            static_inputs["input_ids"] = torch.zeros(
+                (micro_batch_size, seq_length), dtype=torch.long, device=torch.cuda.current_device()
+            )
+
         return static_inputs
 
     def _get_submodules_under_cudagraphs(self):
@@ -1039,6 +1070,18 @@ def _te_cuda_graph_capture(self, *args, **kwargs):
            attribute can be set to control the scope of the CUDA graph.
         2. If context is None, it cannot be returned as output.
         """
+        # Record the backward event on cuda graph stream in backward pass.
+        # This is to ensure the main stream waits for computing on cuda graph stream to complete,
+        # and overlaps with the H2D transfer on reload stream.
+        if self.offload_module_in_cuda_graph:
+            if len(args) > 0:
+                hidden_states = args[0]
+                hidden_states = self.off_interface.backward_record(hidden_states)
+                args = (hidden_states,) + args[1:]
+            else:
+                hidden_states = kwargs.pop("hidden_states")
+                hidden_states = self.off_interface.backward_record(hidden_states)
+                kwargs["hidden_states"] = hidden_states
         context = None
         if not self.config.cuda_graph_scope or CudaGraphScope.attn in self.config.cuda_graph_scope:
             hidden_states, context = self._forward_attention(*args, **kwargs)
@@ -1059,13 +1102,20 @@ def _te_cuda_graph_capture(self, *args, **kwargs):
                 )
             )
         ):
-            hidden_states = self._forward_mlp(hidden_states)
+            hidden_states = self._forward_mlp(
+                hidden_states, input_ids=kwargs.get("input_ids", None)
+            )
         if not isinstance(hidden_states, list) and not isinstance(hidden_states, tuple):
             cuda_graph_outputs = [hidden_states]
         else:
             cuda_graph_outputs = list(hidden_states)
         if context is not None:
             cuda_graph_outputs.append(context)
+        # Record the forward event on cuda graph stream for cuda graph capture.
+        # This is to ensure the main stream waits for computing on cuda graph stream to complete,
+        # and overlaps with the D2H transfer on offloading stream.
+        if self.offload_module_in_cuda_graph:
+            self.off_interface.forward_record()
         return tuple(cuda_graph_outputs)
 
     def _te_cuda_graph_replay(self, *args, **kwargs):
@@ -1089,8 +1139,25 @@ def _te_cuda_graph_replay(self, *args, **kwargs):
             "For inference cuda graph, please use cuda_graph_impl=local instead."
         )
 
+        if self.config.delay_offload_until_cuda_graph:
+            self.off_interface.enter_replay()
+
+        try:
+            return self._te_cuda_graph_replay_impl(args, kwargs, context)
+        finally:
+            if self.config.delay_offload_until_cuda_graph:
+                self.off_interface.exit_replay()
+
+    def _te_cuda_graph_replay_impl(self, args, kwargs, context):
+        """Implementation of _te_cuda_graph_replay, separated for replay mode cleanup."""
         cuda_graph_output = list(super()._te_cuda_graph_replay(*args, **kwargs))
 
+        # Flush delayed offload groups from previous layers after graph replay.
+        # The CPU is idle during the sync between graph replay and a2a comm,
+        # so we use that time to execute the delayed offload operations.
+        if self.config.delay_offload_until_cuda_graph:
+            self.off_interface.flush_delayed_groups()
+
         if kwargs.get('context') is not None:
             context = cuda_graph_output.pop()
 
@@ -1127,11 +1194,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs):
                     valid_cudagraph_attrs
                 ), f"attr_outputs: {len(attr_outputs)} != {len(valid_cudagraph_attrs)}"
                 for i, attr_name in enumerate(valid_cudagraph_attrs):
-                    hier_attr_name = attr_name.split('.')
-                    attr = self.mlp.token_dispatcher
-                    for name in hier_attr_name[:-1]:
-                        attr = getattr(attr, name)
-                    setattr(attr, hier_attr_name[-1], attr_outputs[i])
+                    self.mlp.token_dispatcher.set_cudagraph_attr(attr_name, attr_outputs[i])
             else:
                 # CUDA graph output is [hidden_states, probs, routing_map].
                 assert len(cuda_graph_output) == 3, (
@@ -1190,7 +1253,7 @@ def _te_cuda_graph_replay(self, *args, **kwargs):
                 return residual, hidden_states, probs, shared_expert_output
 
             # CUDA Graph does not capture the MLP/MoE part at all.
-            output = self._forward_mlp(*cuda_graph_output)
+            output = self._forward_mlp(*cuda_graph_output, input_ids=kwargs.get("input_ids", None))
         return output, context
 
     def _get_te_cuda_graph_replay_args(self, *args, **kwargs):
@@ -1280,6 +1343,110 @@ def _should_call_local_cudagraph(self, *args, **kwargs):
                 return True
         return False
 
+    def backward_dw_cudagraph(self, microbatch_idx):
+        """
+        CUDA Graph backward weight gradient computation for this layer.
+        """
+        cg_index = microbatch_idx % len(self.cuda_graphs)
+        if not hasattr(self.cuda_graphs[cg_index], 'backward_dw'):
+            return
+        self.cuda_graphs[cg_index].backward_dw()
+
+    def __call__(self, *args, **kwargs):
+        # Extract mhc_recompute_manager before CUDA graph manager processes kwargs,
+        # since CheckpointManager is not a CUDA-graph-supported type.
+        self._mhc_recompute_manager = kwargs.pop("mhc_recompute_manager", None)
+        kwargs.pop("is_last_layer_in_recompute_block", None)
+
+        if self._should_call_local_cudagraph(*args, **kwargs):
+            # Inference mode.
+            if kwargs.get('inference_context') is not None:
+                # dynamic_inference_decode_only is not a real argument to forward, it is only used
+                # to differentiate the cuda graph used for decode from the one used for non-decode
+                # inference.
+                kwargs["dynamic_inference_decode_only"] = kwargs[
+                    'inference_context'
+                ].is_decode_only()
+
+        return super().__call__(*args, **kwargs)
+
+    def _set_offload_modules(self):
+        """Set the offload modules for the transformer layer."""
+        if self.config.fine_grained_activation_offloading:
+            self.offload_attn_norm = "attn_norm" in self.config.offload_modules and not isinstance(
+                self.input_layernorm, IdentityOp
+            )
+            self.offload_qkv_linear = "qkv_linear" in self.config.offload_modules
+            self.offload_core_attn = "core_attn" in self.config.offload_modules
+            self.offload_attn_proj = "attn_proj" in self.config.offload_modules
+            self.offload_mlp_norm = "mlp_norm" in self.config.offload_modules and not isinstance(
+                self.pre_mlp_layernorm, IdentityOp
+            )
+            self.offload_expert_fc1 = "expert_fc1" in self.config.offload_modules
+            self.offload_moe_act = "moe_act" in self.config.offload_modules
+        else:
+            self.offload_attn_norm = False
+            self.offload_qkv_linear = False
+            self.offload_core_attn = False
+            self.offload_attn_proj = False
+            self.offload_mlp_norm = False
+            self.offload_expert_fc1 = False
+            self.offload_moe_act = False
+        # Check the compatibility of fine-grained activation offloading and cuda graph.
+        if self.config.fine_grained_activation_offloading:
+            if CudaGraphScope.attn in self.config.cuda_graph_scope:
+                self.offload_attn_norm = False
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
+                    "attn_norm offloading is not supported with attn cudagraph. "
+                    "Disabling attn_norm offloading.",
+                )
+            mark_mlp_norm_offloading_not_supported = False
+            # For moe layer, mlp_norm offloading isn't supported with attn or moe_router cudagraph.
+            if self.is_moe_layer:
+                if (
+                    CudaGraphScope.attn in self.config.cuda_graph_scope
+                    or CudaGraphScope.moe_router in self.config.cuda_graph_scope
+                ):
+                    mark_mlp_norm_offloading_not_supported = True
+            # For non-moe layer, mlp_norm is the boundary of attn or mlp cudagraph.
+            # The only case where mlp_norm offloading is supported is when whole layer is captured.
+            elif (
+                CudaGraphScope.attn in self.config.cuda_graph_scope
+                and CudaGraphScope.mlp not in self.config.cuda_graph_scope
+            ) or (
+                CudaGraphScope.attn not in self.config.cuda_graph_scope
+                and CudaGraphScope.mlp in self.config.cuda_graph_scope
+            ):
+                mark_mlp_norm_offloading_not_supported = True
+            if mark_mlp_norm_offloading_not_supported:
+                self.offload_mlp_norm = False
+                log_single_rank(
+                    logger,
+                    logging.WARNING,
+                    "mlp_norm offloading is not supported with the current cudagraph scope. "
+                    "Disabling mlp_norm offloading.",
+                )
+        # Set the offload module in cuda graph flag.
+        self.offload_module_in_cuda_graph = False
+        if CudaGraphScope.attn in self.config.cuda_graph_scope:
+            if self.offload_core_attn or self.offload_attn_proj or self.offload_qkv_linear:
+                self.offload_module_in_cuda_graph = True
+        if not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope:
+            if self.offload_mlp_norm:
+                self.offload_module_in_cuda_graph = True
+        if self.offload_module_in_cuda_graph:
+            assert is_torch_min_version(
+                "2.9.0a0"
+            ), "Offloading modules captured in cuda graph requires torch>=2.9.0."
+            assert is_te_min_version(
+                "2.14.0"
+            ), "Offloading modules captured in cuda graph requires TE>=2.14.0."
+            assert (
+                self.config.cuda_graph_warmup_steps > 0
+            ), "Fine-grained activation offloading needs cuda_graph_warmup_steps > 0."
+
     def get_layer_norm_weights(self):
         """
         Get the weights of all layernorms (attention and MLP) in the transformer layer.
@@ -1289,6 +1456,501 @@ def get_layer_norm_weights(self):
         return
 
 
+class HyperConnectionTransformerLayer(TransformerLayer):
+    """A transformer layer with Manifold-Constrained Hyper-Connections (mHC).
+
+    Extends TransformerLayer by adding hyper connection modules around self-attention
+    and MLP. The n-stream hidden states are aggregated before each sub-layer and
+    expanded back afterwards using learned mappings (H_pre, H_post, H_res).
+
+    Cross-attention hyper connection is not supported.
+    """
+
+    def __init__(
+        self,
+        config: TransformerConfig,
+        submodules: TransformerLayerSubmodules,
+        layer_number: int = 1,
+        hidden_dropout: Optional[float] = None,
+        pg_collection: Optional[ProcessGroupCollection] = None,
+        vp_stage: Optional[int] = None,
+        is_mtp_layer: bool = False,
+    ):
+        super().__init__(
+            config=config,
+            submodules=submodules,
+            layer_number=layer_number,
+            hidden_dropout=hidden_dropout,
+            pg_collection=pg_collection,
+            vp_stage=vp_stage,
+            is_mtp_layer=is_mtp_layer,
+        )
+
+        if submodules.cross_attention_hyper_connection is not IdentityOp:
+            raise ValueError(
+                "HyperConnectionTransformerLayer does not support cross-attention "
+                "hyper connections. Use IdentityOp for cross_attention_hyper_connection."
+            )
+
+        assert submodules.self_attention_hyper_connection is not IdentityOp, (
+            "HyperConnectionTransformerLayer requires self_attention_hyper_connection. "
+            "Use TransformerLayer instead if hyper connections are not needed."
+        )
+        assert submodules.mlp_hyper_connection is not IdentityOp, (
+            "HyperConnectionTransformerLayer requires mlp_hyper_connection. "
+            "Use TransformerLayer instead if hyper connections are not needed."
+        )
+
+        self.self_attention_hyper_connection = build_module(
+            submodules.self_attention_hyper_connection,
+            config=self.config,
+            layer_number=self.layer_number,
+        )
+
+        self.mlp_hyper_connection = build_module(
+            submodules.mlp_hyper_connection, config=self.config, layer_number=self.layer_number
+        )
+
+        # When mHC recompute is active, skip checkpointing if the layernorm
+        # is IdentityOp (fused into TE linear) — there is nothing to recompute.
+        self.mhc_checkpoint_input_layernorm = not isinstance(self.input_layernorm, IdentityOp)
+        self.mhc_checkpoint_pre_mlp_layernorm = not isinstance(self.pre_mlp_layernorm, IdentityOp)
+
+    def get_layer_static_inputs(self, seq_length, micro_batch_size):
+        """Override to produce n-stream hidden_states of shape [s, b, n*C].
+
+        CUDA graph capture creates static buffers whose shapes are determined by
+        this method. The base class returns [s, b, C], but mHC layers operate on
+        n-stream hidden states of shape [s, b, n*C].
+        """
+        static_inputs = super().get_layer_static_inputs(seq_length, micro_batch_size)
+        hs = static_inputs["hidden_states"]
+        n = self.config.num_residual_streams
+        static_inputs["hidden_states"] = torch.ones(
+            (hs.shape[0], hs.shape[1], n * self.config.hidden_size),
+            dtype=hs.dtype,
+            requires_grad=hs.requires_grad,
+            device=hs.device,
+        )
+
+        # Add input_ids for hash-based MoE routing under CUDA graphs.
+        # Only add for layers that actually use hash routing,
+        # since other layers (e.g. on later PP stages) receive input_ids=None.
+        if (
+            self.is_moe_layer
+            and self.config.moe_n_hash_layers > 0
+            and getattr(self.mlp.router, 'is_hash_layer', False)
+        ):
+            static_inputs["input_ids"] = torch.zeros(
+                (micro_batch_size, seq_length), dtype=torch.long, device=torch.cuda.current_device()
+            )
+
+        return static_inputs
+
+    def _get_submodules_under_cudagraphs(self):
+        """Override to include hyper connection modules.
+
+        The base TransformerLayer._get_submodules_under_cudagraphs does not include
+        self_attention_hyper_connection / mlp_hyper_connection. Their learnable
+        parameters (mapping_proj, alpha_*, bias) need manual pre-forward hooks
+        during CUDA graph replay so that parameter all-gathers are triggered.
+        """
+        submodules = super()._get_submodules_under_cudagraphs()
+
+        if not self.config.cuda_graph_scope:
+            return submodules
+
+        if CudaGraphScope.attn in self.config.cuda_graph_scope:
+            submodules.append(self.self_attention_hyper_connection)
+        if (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope) or (
+            self.is_moe_layer
+            and (
+                CudaGraphScope.moe in self.config.cuda_graph_scope
+                or CudaGraphScope.moe_router in self.config.cuda_graph_scope
+            )
+        ):
+            submodules.append(self.mlp_hyper_connection)
+        return submodules
+
+    def forward(self, *args, **kwargs):
+        """Forward pass with MHC recompute manager support."""
+        kwargs.pop("dynamic_inference_decode_only", None)
+
+        mhc_recompute_manager = getattr(self, '_mhc_recompute_manager', None)
+
+        hidden_states, context = self._forward_attention(
+            *args, mhc_recompute_manager=mhc_recompute_manager, **kwargs
+        )
+
+        output = self._forward_mlp(
+            hidden_states,
+            kwargs.get("inference_context", None),
+            padding_mask=kwargs.get("padding_mask", None),
+            input_ids=kwargs.get("input_ids", None),
+            mhc_recompute_manager=mhc_recompute_manager,
+        )
+        return output, context
+
+    def _forward_attention(
+        self,
+        hidden_states: Tensor,
+        attention_mask: Optional[Tensor] = None,
+        context: Optional[Tensor] = None,
+        context_mask: Optional[Tensor] = None,
+        rotary_pos_emb: Optional[Tensor] = None,
+        rotary_pos_cos: Optional[Tensor] = None,
+        rotary_pos_sin: Optional[Tensor] = None,
+        rotary_pos_cos_sin: Optional[Tensor] = None,
+        attention_bias: Optional[Tensor] = None,
+        inference_context: Optional[Any] = None,
+        packed_seq_params: Optional[PackedSeqParams] = None,
+        sequence_len_offset: Optional[Tensor] = None,
+        padding_mask: Optional[Tensor] = None,
+        input_ids: Optional[Tensor] = None,
+        mhc_recompute_manager: Optional['CheckpointManager'] = None,
+        *,
+        inference_params: Optional[Any] = None,
+    ):
+        """Forward attention with hyper connection pre/post processing on self-attention."""
+        inference_context = deprecate_inference_params(inference_context, inference_params)
+
+        residual = hidden_states
+
+        nvtx_range_push(suffix="self_attention_hyper_connection")
+        hidden_states, self_attn_h_res, self_attn_hc_h_post = self.self_attention_hyper_connection(
+            hidden_states, mhc_recompute_manager=mhc_recompute_manager
+        )
+        nvtx_range_pop(suffix="self_attention_hyper_connection")
+
+        # Optional Input Layer norm
+        checkpoint_input_layernorm = self.recompute_input_layernorm or (
+            mhc_recompute_manager is not None and self.mhc_checkpoint_input_layernorm
+        )
+        attn_norm_manager = self.off_interface(self.offload_attn_norm, hidden_states, "attn_norm")
+        if checkpoint_input_layernorm:
+            self.input_layernorm_checkpoint = tensor_parallel.CheckpointWithoutOutput(
+                ckpt_manager=mhc_recompute_manager
+            )
+            with attn_norm_manager as hidden_states:
+                input_layernorm_output = self.input_layernorm_checkpoint.checkpoint(
+                    self.input_layernorm, hidden_states
+                )
+        else:
+            with attn_norm_manager as hidden_states:
+                input_layernorm_output = self.input_layernorm(hidden_states)
+
+        # Self attention.
+        nvtx_range_push(suffix="self_attention")
+        attention_output_with_bias = self.self_attention(
+            input_layernorm_output,
+            attention_mask=attention_mask,
+            inference_context=inference_context,
+            rotary_pos_emb=rotary_pos_emb,
+            rotary_pos_cos=rotary_pos_cos,
+            rotary_pos_sin=rotary_pos_sin,
+            rotary_pos_cos_sin=rotary_pos_cos_sin,
+            attention_bias=attention_bias,
+            packed_seq_params=packed_seq_params,
+            sequence_len_offset=sequence_len_offset,
+        )
+        nvtx_range_pop(suffix="self_attention")
+
+        if checkpoint_input_layernorm:
+            self.input_layernorm_checkpoint.discard_output_and_register_recompute(
+                attention_output_with_bias[0]
+            )
+
+        nvtx_range_push(suffix="self_attention_fused_h_res_h_post_bda")
+        with self.bias_dropout_add_exec_handler():
+            hidden_states = self.self_attention_hyper_connection.fused_h_res_h_post_bda(
+                self_attn_h_res,
+                residual,
+                self_attn_hc_h_post,
+                attention_output_with_bias,
+                self.hidden_dropout,
+                self.training,
+                self.config.bias_dropout_fusion,
+                mhc_recompute_manager,
+            )
+        nvtx_range_pop(suffix="self_attention_fused_h_res_h_post_bda")
+
+        hidden_states = attn_norm_manager.group_offload(hidden_states)
+
+        # Cross-attention (no hyper connection support).
+        residual = hidden_states
+        pre_cross_attn_layernorm_output = self.pre_cross_attn_layernorm(hidden_states)
+
+        attention_output_with_bias = self.cross_attention(
+            pre_cross_attn_layernorm_output,
+            attention_mask=context_mask,
+            key_value_states=context,
+            inference_context=inference_context,
+        )
+
+        if isinstance(attention_output_with_bias, dict) and "context" in attention_output_with_bias:
+            context = attention_output_with_bias["context"]
+
+        with self.bias_dropout_add_exec_handler():
+            hidden_states = self.cross_attn_bda(self.training, self.config.bias_dropout_fusion)(
+                attention_output_with_bias, residual, self.hidden_dropout
+            )
+
+        return hidden_states, context
+
+    def _forward_mlp(
+        self,
+        hidden_states,
+        inference_context=None,
+        padding_mask=None,
+        input_ids=None,
+        mhc_recompute_manager: Optional['CheckpointManager'] = None,
+    ):
+        """Forward MLP with hyper connection pre/post processing."""
+        is_last_in_recompute_block = bool(
+            mhc_recompute_manager is not None
+            and getattr(mhc_recompute_manager, "is_last_layer_in_recompute_block", False)
+        )
+        mhc_mlp_bda_manager = None if is_last_in_recompute_block else mhc_recompute_manager
+
+        residual = hidden_states
+
+        nvtx_range_push(suffix="mlp_hyper_connection")
+        hidden_states, mlp_h_res, mlp_hc_h_post = self.mlp_hyper_connection(
+            hidden_states, mhc_recompute_manager=mhc_recompute_manager
+        )
+        nvtx_range_pop(suffix="mlp_hyper_connection")
+
+        # Optional Layer norm post the cross-attention.
+        checkpoint_pre_mlp_layernorm = self.recompute_pre_mlp_layernorm or (
+            mhc_recompute_manager is not None and self.mhc_checkpoint_pre_mlp_layernorm
+        )
+        self.mlp_norm_manager = self.off_interface(self.offload_mlp_norm, hidden_states, "mlp_norm")
+        if checkpoint_pre_mlp_layernorm:
+            self.pre_mlp_norm_checkpoint = tensor_parallel.CheckpointWithoutOutput(
+                ckpt_manager=mhc_recompute_manager
+            )
+            with self.mlp_norm_manager as hidden_states:
+                pre_mlp_layernorm_output = self.pre_mlp_norm_checkpoint.checkpoint(
+                    self.pre_mlp_layernorm, hidden_states
+                )
+        else:
+            with self.mlp_norm_manager as hidden_states:
+                pre_mlp_layernorm_output = self.pre_mlp_layernorm(hidden_states)
+
+        nvtx_range_push(suffix="mlp")
+        should_chunk_mlp_for_prefill = (
+            self.config.mlp_chunks_for_prefill > 1
+            and inference_context is not None
+            and not inference_context.is_decode_only()
+            and not isinstance(self.mlp, IdentityOp)
+            and not self.config.transformer_impl == "inference_optimized"
+        )
+
+        moe_kwargs = {}
+        if self.is_moe_layer and input_ids is not None:
+            moe_kwargs['input_ids'] = input_ids
+
+        if self.recompute_mlp:
+            if self.config.fp8 or self.config.fp4:
+                from megatron.core.extensions.transformer_engine import te_checkpoint
+
+                mlp_output_with_bias = te_checkpoint(
+                    self.mlp,
+                    False,
+                    tensor_parallel.random.get_cuda_rng_tracker,
+                    self.pg_collection.tp,
+                    pre_mlp_layernorm_output,
+                    padding_mask=padding_mask,
+                    **moe_kwargs,
+                )
+            else:
+                mlp_output_with_bias = tensor_parallel.checkpoint(
+                    functools.partial(self.mlp, padding_mask=padding_mask, **moe_kwargs),
+                    False,
+                    pre_mlp_layernorm_output,
+                )
+        elif should_chunk_mlp_for_prefill:
+            num_chunks = min(self.config.mlp_chunks_for_prefill, pre_mlp_layernorm_output.shape[0])
+            chunks = pre_mlp_layernorm_output.chunk(num_chunks, dim=0)
+            outputs = [self.mlp(chunk) for chunk in chunks]
+            mlp_output = torch.cat([out for out, _ in outputs], dim=0)
+            bias_chunks = [bias for _, bias in outputs if bias is not None]
+            bias_output = torch.stack(bias_chunks, dim=0).sum(dim=0) if bias_chunks else None
+            mlp_output_with_bias = (mlp_output, bias_output)
+        else:
+            mlp_output_with_bias = self.mlp(
+                pre_mlp_layernorm_output, padding_mask=padding_mask, **moe_kwargs
+            )
+
+        nvtx_range_pop(suffix="mlp")
+
+        # During TE CUDA graph partial MoE capture, skip HC post-processing and return
+        # intermediate outputs + HC state. The post-processing will be done during replay.
+        if (
+            self.is_moe_layer
+            and self.config.cuda_graph_impl == "transformer_engine"
+            and self.training
+            and is_graph_capturing()
+            and CudaGraphScope.moe_router in self.config.cuda_graph_scope
+        ):
+            if self.recompute_pre_mlp_layernorm or (
+                mhc_recompute_manager is not None and self.mhc_checkpoint_pre_mlp_layernorm
+            ):
+                for tensor in mlp_output_with_bias:
+                    self.pre_mlp_norm_checkpoint.discard_output_and_register_recompute(tensor)
+            # Append HC state (mlp_hc_h_post, mlp_h_res, residual) for replay.
+            return list(mlp_output_with_bias) + [mlp_hc_h_post, mlp_h_res, residual]
+
+        return self._forward_post_mlp_with_fused_hyper_connection(
+            mlp_output_with_bias, mlp_h_res, residual, mlp_hc_h_post, mhc_mlp_bda_manager
+        )
+
+    def _forward_post_mlp_with_fused_hyper_connection(
+        self,
+        mlp_output_with_bias,
+        mlp_h_res,
+        residual,
+        mlp_hc_h_post,
+        mhc_mlp_bda_recompute_manager: Optional['CheckpointManager'] = None,
+    ):
+        """
+        Perform operations after the MLP computation with fused hyper connection kernel.
+
+        This method uses the fused kernel combining apply_h_res, apply_h_post and bias-dropout-add.
+
+        Args:
+            mlp_output_with_bias (Tensor): Output tensor of the MLP layer with bias.
+            mlp_h_res (Tensor): [s, b, n, n] - residual mixing matrix from hyper connection.
+            residual (Tensor): [s, b, n*C] - original residual (n-stream hidden states).
+            mlp_hc_h_post (Tensor): [s, b, n] - expansion weights from hyper connection.
+            mhc_recompute_manager: Optional CheckpointManager for checkpoint management.
+
+        Returns:
+            output (Tensor): Transformed hidden states of shape [s, b, h].
+        """
+        if self.recompute_pre_mlp_layernorm or (
+            mhc_mlp_bda_recompute_manager is not None and self.mhc_checkpoint_pre_mlp_layernorm
+        ):
+            self.pre_mlp_norm_checkpoint.discard_output_and_register_recompute(
+                mlp_output_with_bias[0]
+            )
+
+        nvtx_range_push(suffix="mlp_fused_h_res_h_post_bda")
+        with self.bias_dropout_add_exec_handler():
+            hidden_states = self.mlp_hyper_connection.fused_h_res_h_post_bda(
+                mlp_h_res,
+                residual,
+                mlp_hc_h_post,
+                mlp_output_with_bias,
+                self.hidden_dropout,
+                self.training,
+                self.config.bias_dropout_fusion,
+                mhc_mlp_bda_recompute_manager,
+            )
+        nvtx_range_pop(suffix="mlp_fused_h_res_h_post_bda")
+
+        hidden_states = self.mlp_norm_manager.group_offload(hidden_states)
+
+        output = make_viewless_tensor(
+            inp=hidden_states, requires_grad=hidden_states.requires_grad, keep_graph=True
+        )
+        return output
+
+    def _te_cuda_graph_replay_impl(self, args, kwargs, context):
+        """Implementation of _te_cuda_graph_replay with hyper connection support.
+
+        Overrides the parent's _te_cuda_graph_replay_impl so that the
+        delay_offload_until_cuda_graph lifecycle (enter_replay/exit_replay) in
+        the parent's _te_cuda_graph_replay is preserved.
+
+        During MoE partial CUDA graph capture, the graph outputs include HC state
+        (mlp_hc_h_post, mlp_h_res) in addition to the base class outputs. This method
+        extracts the HC state and uses it for post-processing after resuming the MoE forward.
+        """
+        cuda_graph_output = list(
+            GraphableMegatronModule._te_cuda_graph_replay(self, *args, **kwargs)
+        )
+
+        # Flush delayed offload groups from previous layers after graph replay.
+        if self.config.delay_offload_until_cuda_graph:
+            self.off_interface.flush_delayed_groups()
+
+        if kwargs.get('context') is not None:
+            context = cuda_graph_output.pop()
+
+        if (
+            not self.config.cuda_graph_scope
+            or (not self.is_moe_layer and CudaGraphScope.mlp in self.config.cuda_graph_scope)
+            or (self.is_moe_layer and CudaGraphScope.moe in self.config.cuda_graph_scope)
+        ):
+            assert len(cuda_graph_output) == 1, "CUDA Graph output should be the layer output."
+            output = cuda_graph_output.pop()
+            assert (
+                not self.config.overlap_moe_expert_parallel_comm
+            ), "EP overlap must be \
+                disabled when CUDA graph captures the whole MLP/MoE part."
+        elif self.is_moe_layer and CudaGraphScope.moe_router in self.config.cuda_graph_scope:
+            # Pop HC state (appended during capture in _forward_mlp).
+            residual = cuda_graph_output.pop()
+            mlp_h_res = cuda_graph_output.pop()
+            mlp_hc_h_post = cuda_graph_output.pop()
+
+            shared_expert_output, routing_map = None, None
+            if (
+                self.config.moe_shared_expert_intermediate_size is not None
+                and not self.config.moe_shared_expert_overlap
+            ):
+                shared_expert_output = cuda_graph_output.pop()
+
+            if CudaGraphScope.moe_preprocess in self.config.cuda_graph_scope:
+                (hidden_states, probs), attr_outputs = (
+                    cuda_graph_output[:2],
+                    cuda_graph_output[2:],
+                )
+                valid_cudagraph_attrs = self.mlp.token_dispatcher.valid_cudagraph_attrs
+                assert len(attr_outputs) == len(
+                    valid_cudagraph_attrs
+                ), f"attr_outputs: {len(attr_outputs)} != {len(valid_cudagraph_attrs)}"
+                for i, attr_name in enumerate(valid_cudagraph_attrs):
+                    self.mlp.token_dispatcher.set_cudagraph_attr(attr_name, attr_outputs[i])
+            else:
+                assert len(cuda_graph_output) == 3, (
+                    "CUDA graph output should be [hidden_states, probs, routing_map], "
+                    f"but got {len(cuda_graph_output)} elements"
+                )
+                hidden_states, probs, routing_map = cuda_graph_output
+
+            # Resume the MoELayer forward pass from the end of the CUDA graph scope.
+            nvtx_range_push(suffix="mlp")
+            self.mlp.cudagraph_tensor_store.set(
+                hidden_states=hidden_states,
+                probs=probs,
+                routing_map=routing_map,
+                shared_expert_output=shared_expert_output,
+            )
+            # If EP overlap is enabled, remaining of mlp will be called as fine_grained_callables
+            # and should be skipped here.
+            if self.config.overlap_moe_expert_parallel_comm:
+                probs, routing_map = self.mlp.route(hidden_states)
+                hidden_states, probs = self.mlp.preprocess(hidden_states, probs, routing_map)
+                nvtx_range_pop(suffix="mlp")
+                return residual, hidden_states, probs, shared_expert_output
+            mlp_output_with_bias = self.mlp(hidden_states)
+            self.mlp.cudagraph_tensor_store.clear()
+            nvtx_range_pop(suffix="mlp")
+
+            # HC post-processing with fused h_res, h_post and BDA.
+            recompute_pre_mlp_layernorm = self.recompute_pre_mlp_layernorm
+            self.recompute_pre_mlp_layernorm = False
+            output = self._forward_post_mlp_with_fused_hyper_connection(
+                mlp_output_with_bias, mlp_h_res, residual, mlp_hc_h_post
+            )
+            self.recompute_pre_mlp_layernorm = recompute_pre_mlp_layernorm
+        else:
+            output = self._forward_mlp(*cuda_graph_output, input_ids=kwargs.get("input_ids", None))
+        return output, context
+
+
 class MoETransformerLayer(TransformerLayer):
     """
     A Transformer layer specialized for Mixture-of-Experts (MoE) architectures.
@@ -1383,7 +2045,7 @@ def create_mcore_cudagraph_manager(self, config):
         ):
             self.transition_cudagraph_scope('partial')
 
-    def _forward_mlp_router(self, hidden_states, padding_mask=None):
+    def _forward_mlp_router(self, hidden_states, padding_mask=None, input_ids=None):
         """
         Executes the router phase of the MoE block.
 
@@ -1408,14 +2070,14 @@ def _forward_mlp_router(self, hidden_states, padding_mask=None):
             residual = residual.float()
 
         router_outputs = self.mlp(
-            pre_mlp_layernorm_output, intermediate_tensors=(), padding_mask=padding_mask
+            pre_mlp_layernorm_output,
+            intermediate_tensors=(),
+            padding_mask=padding_mask,
+            input_ids=input_ids,
         )
 
         for attr_name in self.mlp.token_dispatcher.cudagraph_attrs:
-            hier_attr_name = attr_name.split('.')
-            attr = self.mlp.token_dispatcher
-            for name in hier_attr_name:
-                attr = getattr(attr, name)
+            attr = self.mlp.token_dispatcher.get_cudagraph_attr(attr_name)
             if torch.is_tensor(attr):
                 if attr_name in self.token_dispatcher_attrs:
                     self.token_dispatcher_attrs[attr_name].copy_(attr)
@@ -1433,12 +2095,8 @@ def _forward_mlp_expert_compute(self, hidden_states, probs):
         step runs eagerly between the router and postprocess graph replays.
         """
 
-        for attr_name, attr in self.token_dispatcher_attrs.items():
-            hier_attr_name = attr_name.split('.')
-            obj = self.mlp.token_dispatcher
-            for name in hier_attr_name[:-1]:
-                obj = getattr(obj, name)
-            setattr(obj, hier_attr_name[-1], attr)
+        for name, attr in self.token_dispatcher_attrs.items():
+            self.mlp.token_dispatcher.set_cudagraph_attr(name, attr)
 
         self.mlp.fwd_execution_map = "expert_compute"
         return self.mlp(None, intermediate_tensors=(hidden_states, probs))
@@ -1463,7 +2121,9 @@ def _forward_mlp_postprocess(self, residual, output, shared_expert_output, mlp_b
         output = self.mlp(None, intermediate_tensors=(output, shared_expert_output))
         return self._forward_post_mlp((output, mlp_bias), residual)
 
-    def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None):
+    def _forward_mlp(
+        self, hidden_states, inference_context=None, padding_mask=None, input_ids=None
+    ):
         """
         Orchestrates the MLP forward pass, handling partial CUDA graph execution logic.
 
@@ -1479,10 +2139,10 @@ def _forward_mlp(self, hidden_states, inference_context=None, padding_mask=None)
             )
 
         def _forward_mlp_partial_cudagraphs(
-            hidden_states, inference_context=None, padding_mask=None
+            hidden_states, inference_context=None, padding_mask=None, input_ids=None
         ):
             residual, hidden_states, probs, shared_expert_output = self._forward_mlp_router(
-                hidden_states, padding_mask=padding_mask
+                hidden_states, padding_mask=padding_mask, input_ids=input_ids
             )
 
             # After the router graph replays, the captured .copy_() operations that update
@@ -1511,16 +2171,23 @@ def _forward_mlp_partial_cudagraphs(
                         parallel_state.get_tensor_model_parallel_group(),
                         hidden_states,
                         padding_mask=padding_mask,
+                        input_ids=input_ids,
                     )
                 else:
                     return tensor_parallel.checkpoint(
                         functools.partial(
-                            _forward_mlp_partial_cudagraphs, padding_mask=padding_mask
+                            _forward_mlp_partial_cudagraphs,
+                            padding_mask=padding_mask,
+                            input_ids=input_ids,
                         ),
                         False,
                         hidden_states,
                     )
             else:
-                return _forward_mlp_partial_cudagraphs(hidden_states, padding_mask=padding_mask)
+                return _forward_mlp_partial_cudagraphs(
+                    hidden_states, padding_mask=padding_mask, input_ids=input_ids
+                )
         else:
-            return super()._forward_mlp(hidden_states, padding_mask=padding_mask)
+            return super()._forward_mlp(
+                hidden_states, padding_mask=padding_mask, input_ids=input_ids
+            )
diff --git a/megatron/core/utils.py b/megatron/core/utils.py
index 3fac8fdafff..b7d82d260d9 100644
--- a/megatron/core/utils.py
+++ b/megatron/core/utils.py
@@ -932,6 +932,12 @@ def make_tp_sharded_tensor_for_checkpoint(
     # Pop group parameters from kwargs
     tp_group = kwargs.pop('tp_group', None)
     dp_cp_group = kwargs.pop('dp_cp_group', None)
+    # If there are any additional kwargs left, surface them for visibility
+    # (these will be forwarded to ShardedTensor.from_rank_offsets).
+    if kwargs:
+        logger.warning(
+            "make_tp_sharded_tensor_for_checkpoint received extra kwargs: %s", list(kwargs.keys())
+        )
 
     prepend_axis_num = len(prepend_offsets)
 
@@ -997,6 +1003,12 @@ def make_sharded_tensor_for_checkpoint(tensor, key, prepend_offsets=(), replica_
     # Pop group parameters from kwargs
     tp_group = kwargs.pop('tp_group', None)
     dp_cp_group = kwargs.pop('dp_cp_group', None)
+    # If there are any additional kwargs left, surface them for visibility
+    # (these will be forwarded to ShardedTensor.from_rank_offsets).
+    if kwargs:
+        logger.warning(
+            "make_sharded_tensor_for_checkpoint received extra kwargs: %s", list(kwargs.keys())
+        )
 
     prepend_axis_num = len(prepend_offsets)
 
@@ -2066,60 +2078,6 @@ def get_thd_batch_on_this_cp_rank(
     return batch, packed_seq_params
 
 
-################################
-### hybrid context parallel ###
-################################
-
-
-def get_batch_on_this_hybrid_cp_rank(
-    batch: Dict[str, Any],
-    local_cp_size: int,
-    cp_group: Optional[torch.distributed.ProcessGroup] = None,
-):
-    """Slice batch input along sequence dimension into multiple chunks,
-    which are parallelized across GPUs in a context parallel group.
-    """
-    assert local_cp_size is not None
-    if cp_group is None:
-        # Get the local cp group required for as defined by the HybridCPDataLoaderWrapper
-        if local_cp_size > 1:
-            cp_group = parallel_state.get_hybrid_data_context_parallel_groups(
-                group_size=local_cp_size
-            )
-    else:
-        # If cp group is provided, it must match the local cp size
-        # as defined by the HybridCPDataLoaderWrapper
-        assert cp_group.size() == local_cp_size
-
-    # Convert [seqlen] to [1, seqlen] similar to default collate_fn
-    # as hybrid_context_parallel dataloader wrapper does not go through default collate_fn
-    for key, data in batch.items():
-        if key in ['attention_mask']:
-            continue
-        batch[key] = torch.stack([data], 0)
-    sample_length = batch['tokens'].shape[1]
-    # TODO(pmannan): Take care of padding tokens here if not divisible by cp_size*2
-    # Create packed_seq_params for SBHD format with cp group information.
-    packed_seq_params = PackedSeqParams(
-        qkv_format="sbhd",
-        cu_seqlens_q=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
-        cu_seqlens_kv=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
-        cu_seqlens_q_padded=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
-        cu_seqlens_kv_padded=torch.tensor([0, sample_length], device="cuda", pin_memory=True),
-        max_seqlen_q=sample_length,
-        max_seqlen_kv=sample_length,
-        local_cp_size=local_cp_size,
-        cp_group=cp_group,
-    )
-
-    if cp_group is not None and cp_group.size() > 1:
-        # When using hybrid_context_parallel, each sub-sample of a packed sample is
-        # required to be divisible by CP*DP*2 or CP*DP*TP*2 (if using sequence parallel)
-        batch = get_batch_on_this_cp_rank(batch, cp_group=cp_group)
-
-    return batch, packed_seq_params
-
-
 ######################
 ### NVTX profiling ###
 ######################
diff --git a/megatron/inference/utils.py b/megatron/inference/utils.py
index a1204db487a..374a4a1b05f 100644
--- a/megatron/inference/utils.py
+++ b/megatron/inference/utils.py
@@ -4,6 +4,7 @@
 from argparse import ArgumentParser
 from functools import partial
 from typing import Optional
+
 import torch
 
 from gpt_builders import gpt_builder
@@ -74,8 +75,7 @@ def get_model_for_inference() -> MegatronModule:
             quant_backend = "triton"
         elif backend == "te":
             raise ValueError(
-                "MXFP8 quantization is not supported with "
-                "inference_grouped_gemm_backend='te'."
+                "MXFP8 quantization is not supported with " "inference_grouped_gemm_backend='te'."
             )
         quantize_model_to_mxfp8(unwrap_model(model), backend=quant_backend)
     return model
@@ -354,10 +354,18 @@ def get_inference_config_from_model_and_args(model: MegatronModule, args):
         track_paused_request_events=args.inference_dynamic_batching_track_paused_request_events,
         enable_chunked_prefill=args.enable_chunked_prefill,
         enable_prefix_caching=args.inference_dynamic_batching_enable_prefix_caching,
-        prefix_caching_eviction_policy=PrefixCachingEvictionPolicy(args.inference_dynamic_batching_prefix_caching_eviction_policy),
-        prefix_caching_coordinator_policy=PrefixCachingCoordinatorPolicy(args.inference_dynamic_batching_prefix_caching_coordinator_policy),
-        prefix_caching_routing_alpha=getattr(args, 'inference_dynamic_batching_prefix_caching_routing_alpha', 0.5),
-        prefix_caching_mamba_gb=getattr(args, 'inference_dynamic_batching_prefix_caching_mamba_gb', None),
+        prefix_caching_eviction_policy=PrefixCachingEvictionPolicy(
+            args.inference_dynamic_batching_prefix_caching_eviction_policy
+        ),
+        prefix_caching_coordinator_policy=PrefixCachingCoordinatorPolicy(
+            args.inference_dynamic_batching_prefix_caching_coordinator_policy
+        ),
+        prefix_caching_routing_alpha=getattr(
+            args, 'inference_dynamic_batching_prefix_caching_routing_alpha', 0.5
+        ),
+        prefix_caching_mamba_gb=getattr(
+            args, 'inference_dynamic_batching_prefix_caching_mamba_gb', None
+        ),
         metrics_writer=metrics_writer,
         logging_step_interval=args.inference_logging_step_interval,
         num_speculative_tokens=args.num_speculative_tokens,
diff --git a/megatron/legacy/model/__init__.py b/megatron/legacy/model/__init__.py
index 696d5247e0e..979d93892b4 100644
--- a/megatron/legacy/model/__init__.py
+++ b/megatron/legacy/model/__init__.py
@@ -2,6 +2,5 @@
 
 from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
 from .rms_norm import RMSNorm
-
 from .gpt_model import GPTModel
 from .language_model import get_language_model
diff --git a/megatron/legacy/model/transformer.py b/megatron/legacy/model/transformer.py
index a2a85d3747d..55091605a15 100644
--- a/megatron/legacy/model/transformer.py
+++ b/megatron/legacy/model/transformer.py
@@ -13,12 +13,8 @@
 from megatron import core
 from megatron.core import mpu, tensor_parallel
 from megatron.core.enums import ModelType
-from megatron.core.utils import deprecate_inference_params
-from megatron.legacy.model.enums import AttnMaskType, LayerType, AttnType
-from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax
-from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl
-from megatron.core.models.common.embeddings import apply_rotary_pos_emb
 from megatron.core.jit import jit_fuser
+from megatron.core.models.common.embeddings import apply_rotary_pos_emb
 from megatron.core.num_microbatches_calculator import get_num_microbatches
 from megatron.core.parallel_state import (
     get_expert_tensor_and_model_parallel_group,
@@ -26,19 +22,15 @@
 )
 from megatron.core.tensor_parallel import (
     gather_from_sequence_parallel_region,
-    reduce_scatter_to_sequence_parallel_region,
     get_cuda_rng_tracker,
     get_data_parallel_rng_tracker_name,
+    reduce_scatter_to_sequence_parallel_region,
 )
+from megatron.core.utils import deprecate_inference_params
 from megatron.legacy.model.enums import AttnMaskType, AttnType, LayerType
 from megatron.legacy.model.fused_bias_gelu import bias_gelu_impl
 from megatron.legacy.model.fused_softmax import FusedScaleMaskSoftmax
-from megatron.legacy.model.utils import (
-    attention_mask_func,
-    erf_gelu,
-    get_norm,
-    openai_gelu,
-)
+from megatron.legacy.model.utils import attention_mask_func, erf_gelu, get_norm, openai_gelu
 from megatron.training import get_args, get_timers
 
 from .module import MegatronModule
@@ -73,28 +65,31 @@
         hyperparameters: transformer hyperparameters
 """
 
+
 class DropPath(MegatronModule):
     """Drop paths (Stochastic Depth) per sample
     (when applied in main path of residual blocks).
     """
 
-    def __init__(self, drop_prob=0.):
+    def __init__(self, drop_prob=0.0):
         super(DropPath, self).__init__()
         self.drop_prob = drop_prob
 
     def forward(self, hidden_state):
-        if self.drop_prob == 0. or not self.training:
+        if self.drop_prob == 0.0 or not self.training:
             return hidden_state
         keep_prob = 1 - self.drop_prob
         # work with diff dim tensors, not just 2D ConvNets
         # hidden_state: [s, b, h]
         shape = (1,) + (hidden_state.shape[1],) + (1,) * (hidden_state.ndim - 2)
-        random_tensor = keep_prob + \
-            torch.rand(shape, dtype=hidden_state.dtype, device=hidden_state.device)
+        random_tensor = keep_prob + torch.rand(
+            shape, dtype=hidden_state.dtype, device=hidden_state.device
+        )
         random_tensor.floor_()  # binarize
         output = hidden_state.div(keep_prob) * random_tensor
         return output
 
+
 class ParallelMLP(MegatronModule):
     """MLP.
 
@@ -134,13 +129,17 @@ def __init__(self, config, is_expert=False):
         elif args.onnx_safe:
             self.activation_func = erf_gelu
         elif args.swiglu:
+
             def swiglu(x):
                 x = torch.chunk(x, 2, dim=-1)
                 return F.silu(x[0]) * x[1]
+
             self.activation_func = swiglu
         elif args.squared_relu:
+
             def squared_relu(x):
                 return torch.pow(F.relu(x), 2)
+
             self.activation_func = squared_relu
         else:
             self.bias_gelu_fusion = args.bias_gelu_fusion
@@ -176,6 +175,7 @@ def forward(self, hidden_states):
         output, output_bias = self.dense_4h_to_h(intermediate_parallel)
         return output, output_bias
 
+
 def sinkhorn(cost, tol=0.0001):
     cost = torch.exp(cost)
     d0 = torch.ones(cost.size(0), device=cost.device, dtype=cost.dtype)
@@ -185,11 +185,11 @@ def sinkhorn(cost, tol=0.0001):
     error = 1e9
     d1_old = d1
     while error > tol:
-        d0 = (1/d0.size(0))*1/(torch.sum(d1*cost,1) + eps)
-        d1 = (1/d1.size(0))*1/(torch.sum(d0.unsqueeze(1)*cost,0)+eps)
-        error = torch.mean(torch.abs(d1_old-d1))
+        d0 = (1 / d0.size(0)) * 1 / (torch.sum(d1 * cost, 1) + eps)
+        d1 = (1 / d1.size(0)) * 1 / (torch.sum(d0.unsqueeze(1) * cost, 0) + eps)
+        error = torch.mean(torch.abs(d1_old - d1))
         d1_old = d1
-    return d1*cost*d0.unsqueeze(1)
+    return d1 * cost * d0.unsqueeze(1)
 
 
 def get_router_linear_layer(config):
@@ -197,7 +197,7 @@ def get_router_linear_layer(config):
     router = torch.nn.Linear(args.hidden_size, args.num_experts, bias=False)
     with get_cuda_rng_tracker().fork(get_data_parallel_rng_tracker_name()):
         config.init_method(router.weight)
-    setattr(router.weight, 'sequence_parallel',config.sequence_parallel)
+    setattr(router.weight, 'sequence_parallel', config.sequence_parallel)
     return router
 
 
@@ -205,6 +205,7 @@ class SwitchMLP(MegatronModule):
     """
     Routes input to one of N MLP "experts"
     """
+
     def __init__(self, config):
         super(SwitchMLP, self).__init__()
         args = get_args()
@@ -216,7 +217,9 @@ def __init__(self, config):
         assert args.num_experts % self.expert_parallel_size == 0
         self.num_local_experts = args.num_experts // self.expert_parallel_size
         local_expert_indices_offset = mpu.get_expert_model_parallel_rank() * self.num_local_experts
-        self.local_expert_indices = [local_expert_indices_offset + i for i in range(self.num_local_experts)]
+        self.local_expert_indices = [
+            local_expert_indices_offset + i for i in range(self.num_local_experts)
+        ]
 
         self.local_experts = torch.nn.ModuleList()
         for i in range(self.num_local_experts):
@@ -225,7 +228,7 @@ def __init__(self, config):
         self.tp_ep_group = get_expert_tensor_and_model_parallel_group()
 
     def gather_indices(self, local_indices):
-        """ Gather tensors and concatinate along the first dimension."""
+        """Gather tensors and concatinate along the first dimension."""
         world_size = self.tp_ep_group.size()
         # Bypass the function if we are using only 1 GPU.
         if world_size == 1:
@@ -235,8 +238,9 @@ def gather_indices(self, local_indices):
         dim_size[0] = dim_size[0] * world_size
 
         # TODO pre allocate memory
-        output = torch.empty(dim_size, dtype=local_indices.dtype,
-                             device=torch.cuda.current_device())
+        output = torch.empty(
+            dim_size, dtype=local_indices.dtype, device=torch.cuda.current_device()
+        )
         torch.distributed._all_gather_base(
             output, local_indices.contiguous(), group=self.tp_ep_group
         )
@@ -270,8 +274,9 @@ def forward(self, hidden_states):
         # Converting [s, b, h] to [s*b, h].
         # Each vector could be routed differently
         if self.sequence_parallel or (self.expert_parallel_size > 1):
-            global_hidden_states = \
-                gather_from_sequence_parallel_region(hidden_states, group=self.tp_ep_group)
+            global_hidden_states = gather_from_sequence_parallel_region(
+                hidden_states, group=self.tp_ep_group
+            )
             global_indices = self.gather_indices(max_ind)
         else:
             global_hidden_states = hidden_states
@@ -292,21 +297,22 @@ def forward(self, hidden_states):
                 output_bias_total[local_indices, :] = output_bias
 
         if self.sequence_parallel or (self.expert_parallel_size > 1):
-            output_total = \
-                reduce_scatter_to_sequence_parallel_region(output_total, group=self.tp_ep_group)
+            output_total = reduce_scatter_to_sequence_parallel_region(
+                output_total, group=self.tp_ep_group
+            )
             if self.add_bias:
-                output_bias_total = \
-                    reduce_scatter_to_sequence_parallel_region(output_bias_total, group=self.tp_ep_group)
+                output_bias_total = reduce_scatter_to_sequence_parallel_region(
+                    output_bias_total, group=self.tp_ep_group
+                )
 
                 # bias is duplicated across tensor parallelism ranks;
                 # reduce scatter reduces bias across tensor parallel_ranks
-                output_bias_total = \
-                    output_bias_total/mpu.get_tensor_model_parallel_world_size()
+                output_bias_total = output_bias_total / mpu.get_tensor_model_parallel_world_size()
 
-        output_total = output_total*max_prob
+        output_total = output_total * max_prob
         output_total = output_total.view(s, b, h)
         if self.add_bias:
-            output_bias_total = output_bias_total*max_prob
+            output_bias_total = output_bias_total * max_prob
             output_bias_total = output_bias_total.view(s, b, h)
         else:
             output_bias_total = None
@@ -316,8 +322,7 @@ def forward(self, hidden_states):
 
 class CoreAttention(MegatronModule):
 
-    def __init__(self, layer_number, config,
-                 attn_mask_type=AttnMaskType.padding):
+    def __init__(self, layer_number, config, attn_mask_type=AttnMaskType.padding):
         super(CoreAttention, self).__init__()
         self.fp16 = config.fp16
         self.bf16 = config.bf16
@@ -334,12 +339,13 @@ def __init__(self, layer_number, config,
 
         # Per attention head and per partition values.
         world_size = mpu.get_tensor_model_parallel_world_size()
-        self.hidden_size_per_partition = core.utils.divide(projection_size,
-                                                           world_size)
+        self.hidden_size_per_partition = core.utils.divide(projection_size, world_size)
         self.hidden_size_per_attention_head = core.utils.divide(
-            projection_size, config.num_attention_heads)
+            projection_size, config.num_attention_heads
+        )
         self.num_attention_heads_per_partition = core.utils.divide(
-            config.num_attention_heads, world_size)
+            config.num_attention_heads, world_size
+        )
 
         coeff = None
         self.norm_factor = math.sqrt(self.hidden_size_per_attention_head)
@@ -348,49 +354,54 @@ def __init__(self, layer_number, config,
             self.norm_factor *= coeff
 
         self.scale_mask_softmax = FusedScaleMaskSoftmax(
-            self.fp16, self.bf16,
+            self.fp16,
+            self.bf16,
             self.attn_mask_type,
             config.masked_softmax_fusion,
             attention_mask_func,
             self.attention_softmax_in_fp32,
-            coeff)
+            coeff,
+        )
 
         # Dropout. Note that for a single iteration, this layer will generate
         # different outputs on different number of parallel partitions but
         # on average it should not be partition dependent.
         self.attention_dropout = torch.nn.Dropout(config.attention_dropout)
 
-    def forward(self, query_layer, key_layer,
-                value_layer, attention_mask):
+    def forward(self, query_layer, key_layer, value_layer, attention_mask):
 
         # ===================================
         # Raw attention scores. [b, np, s, s]
         # ===================================
 
         # [b, np, sq, sk]
-        output_size = (query_layer.size(1),
-                       query_layer.size(2),
-                       query_layer.size(0),
-                       key_layer.size(0))
+        output_size = (
+            query_layer.size(1),
+            query_layer.size(2),
+            query_layer.size(0),
+            key_layer.size(0),
+        )
 
         # [sq, b, np, hn] -> [sq, b * np, hn]
-        query_layer = query_layer.reshape(output_size[2],
-                                          output_size[0] * output_size[1], -1)
+        query_layer = query_layer.reshape(output_size[2], output_size[0] * output_size[1], -1)
         # [sk, b, np, hn] -> [sk, b * np, hn]
-        key_layer = key_layer.view(output_size[3],
-                                   output_size[0] * output_size[1], -1)
+        key_layer = key_layer.view(output_size[3], output_size[0] * output_size[1], -1)
 
         # preallocting input tensor: [b * np, sq, sk]
         matmul_input_buffer = mpu.get_global_memory_buffer().get_tensor(
-            (output_size[0]*output_size[1], output_size[2], output_size[3]),
-            query_layer.dtype, "mpu")
+            (output_size[0] * output_size[1], output_size[2], output_size[3]),
+            query_layer.dtype,
+            "mpu",
+        )
 
         # Raw attention scores. [b * np, sq, sk]
         matmul_result = torch.baddbmm(
             matmul_input_buffer,
-            query_layer.transpose(0, 1),   # [b * np, sq, hn]
+            query_layer.transpose(0, 1),  # [b * np, sq, hn]
             key_layer.transpose(0, 1).transpose(1, 2),  # [b * np, hn, sk]
-            beta=0.0, alpha=(1.0/self.norm_factor))
+            beta=0.0,
+            alpha=(1.0 / self.norm_factor),
+        )
 
         # change view to [b, np, sq, sk]
         attention_scores = matmul_result.view(*output_size)
@@ -400,8 +411,7 @@ def forward(self, query_layer, key_layer,
         # ===========================
 
         # attention scores and attention mask [b, np, sq, sk]
-        attention_probs = self.scale_mask_softmax(attention_scores,
-                                                  attention_mask)
+        attention_probs = self.scale_mask_softmax(attention_scores, attention_mask)
 
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
@@ -419,18 +429,18 @@ def forward(self, query_layer, key_layer,
         # [sk, b, np, hn] --> [b, np, sq, hn]
 
         # context layer shape: [b, np, sq, hn]
-        output_size = (value_layer.size(1),
-                       value_layer.size(2),
-                       query_layer.size(0),
-                       value_layer.size(3))
+        output_size = (
+            value_layer.size(1),
+            value_layer.size(2),
+            query_layer.size(0),
+            value_layer.size(3),
+        )
 
         # change view [sk, b * np, hn]
-        value_layer = value_layer.view(value_layer.size(0),
-                                       output_size[0] * output_size[1], -1)
+        value_layer = value_layer.view(value_layer.size(0), output_size[0] * output_size[1], -1)
 
         # change view [b * np, sq, sk]
-        attention_probs = attention_probs.view(output_size[0] * output_size[1],
-                                               output_size[2], -1)
+        attention_probs = attention_probs.view(output_size[0] * output_size[1], output_size[2], -1)
 
         # matmul: [b * np, sq, hn]
         context_layer = torch.bmm(attention_probs, value_layer.transpose(0, 1))
@@ -442,8 +452,7 @@ def forward(self, query_layer, key_layer,
         context_layer = context_layer.permute(2, 0, 1, 3).contiguous()
 
         # [sq, b, np, hn] --> [sq, b, hp]
-        new_context_layer_shape = context_layer.size()[:-2] + \
-            (self.hidden_size_per_partition,)
+        new_context_layer_shape = context_layer.size()[:-2] + (self.hidden_size_per_partition,)
         context_layer = context_layer.view(*new_context_layer_shape)
 
         return context_layer
@@ -459,11 +468,14 @@ class FlashSelfAttention(torch.nn.Module):
         attention_dropout: The dropout rate to apply to the attention
                            (default: 0.0)
     """
-    def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0,
-                 device=None, dtype=None):
+
+    def __init__(
+        self, causal=False, softmax_scale=None, attention_dropout=0.0, device=None, dtype=None
+    ):
         super().__init__()
-        assert flash_attn_unpadded_func is not None, ('Please install FlashAttention first, '
-                                                      'e.g., with pip install flash-attn')
+        assert flash_attn_unpadded_func is not None, (
+            'Please install FlashAttention first, ' 'e.g., with pip install flash-attn'
+        )
         assert rearrange is not None, 'Please install einops first, e.g., with pip install einops'
         self.causal = causal
         self.softmax_scale = softmax_scale
@@ -476,15 +488,16 @@ def forward(self, q, k, v):
             q, k, v: The tensor containing the query, key, and value. (B, S, H, D)
         """
 
-        assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q,k,v)))
-        assert all((i.is_cuda for i in (q,k,v)))
+        assert all((i.dtype in [torch.float16, torch.bfloat16] for i in (q, k, v)))
+        assert all((i.is_cuda for i in (q, k, v)))
 
         batch_size, seqlen_q = q.shape[0], q.shape[1]
         seqlen_k = k.shape[1]
 
         q, k, v = [rearrange(x, 'b s ... -> (b s) ...') for x in [q, k, v]]
-        cu_seqlens_q = torch.arange(0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32,
-                                    device=q.device)
+        cu_seqlens_q = torch.arange(
+            0, (batch_size + 1) * seqlen_q, step=seqlen_q, dtype=torch.int32, device=q.device
+        )
 
         if self.training:
             # during training q,k,v always have same seqlen
@@ -497,14 +510,22 @@ def forward(self, q, k, v):
             # turn off FA causal mask after first inference autoregressive iteration
             # only on first autoregressive step q,k,v have same seqlen
             is_causal = seqlen_q == seqlen_k
-            cu_seqlens_k = torch.arange(0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32,
-                        device=q.device)
+            cu_seqlens_k = torch.arange(
+                0, (batch_size + 1) * seqlen_k, step=seqlen_k, dtype=torch.int32, device=q.device
+            )
             dropout_p = 0
 
         output = flash_attn_unpadded_func(
-            q, k, v, cu_seqlens_q, cu_seqlens_k, seqlen_q, seqlen_k,
+            q,
+            k,
+            v,
+            cu_seqlens_q,
+            cu_seqlens_k,
+            seqlen_q,
+            seqlen_k,
             dropout_p,
-            softmax_scale=self.softmax_scale, causal=is_causal
+            softmax_scale=self.softmax_scale,
+            causal=is_causal,
         )
 
         output = rearrange(output, '(b s) ... -> b s ...', b=batch_size)
@@ -518,9 +539,13 @@ class ParallelAttention(MegatronModule):
     and returns output of the same size.
     """
 
-    def __init__(self, config, layer_number,
-                 attention_type=AttnType.self_attn,
-                 attn_mask_type=AttnMaskType.padding):
+    def __init__(
+        self,
+        config,
+        layer_number,
+        attention_type=AttnType.self_attn,
+        attn_mask_type=AttnMaskType.padding,
+    ):
         super(ParallelAttention, self).__init__()
         args = get_args()
         self.layer_number = max(1, layer_number)
@@ -538,33 +563,43 @@ def __init__(self, config, layer_number,
         else:
             kv_projection_size = args.kv_channels * args.num_attention_heads
 
-        self.use_flash_attn = args.use_flash_attn \
-            and attention_type == AttnType.self_attn \
+        self.use_flash_attn = (
+            args.use_flash_attn
+            and attention_type == AttnType.self_attn
             and self.attn_mask_type == AttnMaskType.causal
+        )
         if self.use_flash_attn:
             if flash_attn_unpadded_func is None:
-                raise ImportError('FlashAttention is not installed, please install with '
-                                  'pip install flash-attn')
-            assert attention_type == AttnType.self_attn, ('FlashAttention code path only supports '
-                                                          'self-attention for now')
-            assert self.attn_mask_type == AttnMaskType.causal, ('FlashAttention code path only '
-                                                                'supports causal mask for now')
+                raise ImportError(
+                    'FlashAttention is not installed, please install with ' 'pip install flash-attn'
+                )
+            assert attention_type == AttnType.self_attn, (
+                'FlashAttention code path only supports ' 'self-attention for now'
+            )
+            assert self.attn_mask_type == AttnMaskType.causal, (
+                'FlashAttention code path only ' 'supports causal mask for now'
+            )
             if rearrange is None:
                 raise ImportError('einops is not installed, please install with pip install einops')
 
         # Per attention head and per partition values.
         world_size = mpu.get_tensor_model_parallel_world_size()
         self.hidden_size_per_attention_head = core.utils.divide(
-            query_projection_size, config.num_attention_heads)
+            query_projection_size, config.num_attention_heads
+        )
         self.num_attention_heads_per_partition = core.utils.divide(
-            config.num_attention_heads, world_size)
+            config.num_attention_heads, world_size
+        )
 
         if self.group_query_attention:
             if args.num_query_groups % world_size != 0:
-                raise NotImplementedError('Currently the num_query_groups should be '
-                                          'a multiple of the tensor parallel size')
+                raise NotImplementedError(
+                    'Currently the num_query_groups should be '
+                    'a multiple of the tensor parallel size'
+                )
             self.num_query_groups_per_partition = core.utils.divide(
-                        args.num_query_groups, world_size)
+                args.num_query_groups, world_size
+            )
         else:
             self.num_query_groups_per_partition = self.num_attention_heads_per_partition
 
@@ -576,12 +611,15 @@ def __init__(self, config, layer_number,
                 config=config,
                 init_method=config.init_method,
                 bias=args.add_bias_linear or args.add_qkv_bias,
-                gather_output=False)
+                gather_output=False,
+            )
         else:
             assert attention_type == AttnType.cross_attn
 
             if self.group_query_attention:
-                raise NotImplementedError("Grouped query attention not implemented for cross-attention.")
+                raise NotImplementedError(
+                    "Grouped query attention not implemented for cross-attention."
+                )
             assert query_projection_size == kv_projection_size
 
             self.query = tensor_parallel.ColumnParallelLinear(
@@ -590,7 +628,8 @@ def __init__(self, config, layer_number,
                 config=config,
                 init_method=config.init_method,
                 bias=config.add_bias_linear,
-                gather_output=False)
+                gather_output=False,
+            )
 
             self.key_value = tensor_parallel.ColumnParallelLinear(
                 config.hidden_size,
@@ -598,13 +637,12 @@ def __init__(self, config, layer_number,
                 config=config,
                 init_method=config.init_method,
                 bias=config.add_bias_linear,
-                gather_output=False)
+                gather_output=False,
+            )
 
-        self.core_attention = CoreAttention(self.layer_number, config,
-                                            self.attn_mask_type)
+        self.core_attention = CoreAttention(self.layer_number, config, self.attn_mask_type)
         self.checkpoint_core_attention = (
-            config.recompute_granularity == 'selective'
-            and "core_attn" in config.recompute_modules
+            config.recompute_granularity == 'selective' and "core_attn" in config.recompute_modules
         )
 
         if self.use_flash_attn:
@@ -620,28 +658,34 @@ def __init__(self, config, layer_number,
             init_method=config.output_layer_init_method,
             bias=args.add_bias_linear,
             input_is_parallel=True,
-            skip_bias_add=True)
+            skip_bias_add=True,
+        )
 
-    def _checkpointed_attention_forward(self, query_layer, key_layer,
-                                        value_layer, attention_mask,
-                                        rotary_pos_emb=None):
+    def _checkpointed_attention_forward(
+        self, query_layer, key_layer, value_layer, attention_mask, rotary_pos_emb=None
+    ):
         """Forward method with activation checkpointing."""
+
         def custom_forward(*inputs):
             query_layer = inputs[0]
             key_layer = inputs[1]
             value_layer = inputs[2]
             attention_mask = inputs[3]
-            output_ = self.core_attention(query_layer, key_layer,
-                                          value_layer, attention_mask)
+            output_ = self.core_attention(query_layer, key_layer, value_layer, attention_mask)
             return output_
 
-        q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None \
-            else rotary_pos_emb
+        q_pos_emb, k_pos_emb = (None, None) if rotary_pos_emb is None else rotary_pos_emb
 
         hidden_states = tensor_parallel.checkpoint(
             custom_forward,
-            False, query_layer, key_layer, value_layer, attention_mask,
-            q_pos_emb, k_pos_emb)
+            False,
+            query_layer,
+            key_layer,
+            value_layer,
+            attention_mask,
+            q_pos_emb,
+            k_pos_emb,
+        )
 
         return hidden_states
 
@@ -652,11 +696,19 @@ def _allocate_memory(self, inference_max_sequence_len, batch_size, num_attention
             num_attention_heads,
             self.hidden_size_per_attention_head,
             dtype=self.params_dtype,
-            device=torch.cuda.current_device())
+            device=torch.cuda.current_device(),
+        )
 
-    def forward(self, hidden_states, attention_mask,
-                encoder_output=None, inference_context=None,
-                rotary_pos_emb=None, *, inference_params=None):
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_output=None,
+        inference_context=None,
+        rotary_pos_emb=None,
+        *,
+        inference_params=None,
+    ):
         # hidden_states: [sq, b, h]
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
@@ -670,18 +722,21 @@ def forward(self, hidden_states, attention_mask,
                 inf_max_seq_len = inference_context.max_sequence_length
                 inf_max_batch_size = inference_context.max_batch_size
                 inference_key_memory = self._allocate_memory(
-                    inf_max_seq_len, inf_max_batch_size,
-                    self.num_query_groups_per_partition)
+                    inf_max_seq_len, inf_max_batch_size, self.num_query_groups_per_partition
+                )
                 inference_value_memory = self._allocate_memory(
-                    inf_max_seq_len, inf_max_batch_size,
-                    self.num_query_groups_per_partition)
+                    inf_max_seq_len, inf_max_batch_size, self.num_query_groups_per_partition
+                )
 
                 inference_context.key_value_memory_dict[self.layer_number] = (
-                    inference_key_memory, inference_value_memory)
+                    inference_key_memory,
+                    inference_value_memory,
+                )
                 is_first_step = True
             else:
-                inference_key_memory, inference_value_memory = \
+                inference_key_memory, inference_value_memory = (
                     inference_context.key_value_memory_dict[self.layer_number]
+                )
 
         # =====================
         # Query, Key, and Value
@@ -695,49 +750,58 @@ def forward(self, hidden_states, attention_mask,
             new_tensor_shape = mixed_x_layer.size()[:-1] + (
                 self.num_query_groups_per_partition,
                 (
-                    (self.num_attention_heads_per_partition // self.num_query_groups_per_partition + 2)
+                    (
+                        self.num_attention_heads_per_partition
+                        // self.num_query_groups_per_partition
+                        + 2
+                    )
                     * self.hidden_size_per_attention_head
                 ),
             )
             mixed_x_layer = mixed_x_layer.view(*new_tensor_shape)
 
             # [sq, b, ng, (np/ng + 2) * hn] --> [sq, b, ng, np/ng * hn], [sq, b, ng, hn], [sq, b, ng, hn]
-            (query_layer,
-            key_layer,
-            value_layer) = torch.split(
+            (query_layer, key_layer, value_layer) = torch.split(
                 mixed_x_layer,
                 [
                     (
-                        self.num_attention_heads_per_partition // self.num_query_groups_per_partition
+                        self.num_attention_heads_per_partition
+                        // self.num_query_groups_per_partition
                         * self.hidden_size_per_attention_head
                     ),
                     self.hidden_size_per_attention_head,
-                    self.hidden_size_per_attention_head
+                    self.hidden_size_per_attention_head,
                 ],
-                dim=3)
+                dim=3,
+            )
 
             # [sq, b, ng, np/ng * hn] -> [sq, b, np, hn] -
-            query_layer = query_layer.view(query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head)
+            query_layer = query_layer.view(
+                query_layer.size(0), query_layer.size(1), -1, self.hidden_size_per_attention_head
+            )
         else:
             # Attention heads [sk, b, h] --> [sk, b, (np * 2 * hn)]
             mixed_kv_layer, _ = self.key_value(encoder_output)
 
             # [sk, b, (np * 2 * hn)] --> [sk, b, np, 2 * hn]
-            new_tensor_shape = mixed_kv_layer.size()[:-1] + \
-                (self.num_attention_heads_per_partition,
-                2 * self.hidden_size_per_attention_head)
+            new_tensor_shape = mixed_kv_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                2 * self.hidden_size_per_attention_head,
+            )
             mixed_kv_layer = mixed_kv_layer.view(*new_tensor_shape)
 
             # [sk, b, np, 2 * hn] --> 2 [sk, b, np, hn]
-            (key_layer,
-            value_layer) = tensor_parallel.split_tensor_along_last_dim(mixed_kv_layer, 2)
+            (key_layer, value_layer) = tensor_parallel.split_tensor_along_last_dim(
+                mixed_kv_layer, 2
+            )
 
             # Attention head [sq, b, h] --> [sq, b, hp]
             query_layer, _ = self.query(hidden_states)
             # [sq, b, hp] --> [sq, b, np, hn]
-            new_tensor_shape = query_layer.size()[:-1] + \
-                (self.num_attention_heads_per_partition,
-                self.hidden_size_per_attention_head)
+            new_tensor_shape = query_layer.size()[:-1] + (
+                self.num_attention_heads_per_partition,
+                self.hidden_size_per_attention_head,
+            )
             query_layer = query_layer.view(*new_tensor_shape)
 
         # ==================================
@@ -749,7 +813,7 @@ def forward(self, hidden_states, attention_mask,
             if isinstance(rotary_pos_emb, tuple):
                 rotary_pos_emb = rotary_pos_emb
             else:
-                rotary_pos_emb = ((rotary_pos_emb,) * 2)
+                rotary_pos_emb = (rotary_pos_emb,) * 2
 
         if inference_context:
             batch_start = inference_context.batch_size_offset
@@ -757,18 +821,19 @@ def forward(self, hidden_states, attention_mask,
             assert batch_end <= inference_key_memory.size(1)
             sequence_start = inference_context.sequence_len_offset
             sequence_end = sequence_start + key_layer.size(0)
-            assert sequence_end <= inference_key_memory.size(0), ("Current sequence length is "
-            "longer than expected maximum sequence length! Increase inference_max_seq_length.")
+            assert sequence_end <= inference_key_memory.size(0), (
+                "Current sequence length is "
+                "longer than expected maximum sequence length! Increase inference_max_seq_length."
+            )
             # Copy key and values.
-            inference_key_memory[sequence_start:sequence_end,
-                                 batch_start:batch_end, ...] = key_layer
-            inference_value_memory[sequence_start:sequence_end,
-                                   batch_start:batch_end, ...] = value_layer
-            key_layer = inference_key_memory[
-                :sequence_end, batch_start:batch_end, ...]
-            value_layer = inference_value_memory[
-                :sequence_end, batch_start:batch_end, ...]
-
+            inference_key_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = (
+                key_layer
+            )
+            inference_value_memory[sequence_start:sequence_end, batch_start:batch_end, ...] = (
+                value_layer
+            )
+            key_layer = inference_key_memory[:sequence_end, batch_start:batch_end, ...]
+            value_layer = inference_value_memory[:sequence_end, batch_start:batch_end, ...]
 
             # adjust the key rotary positional embedding
             if rotary_pos_emb is not None:
@@ -797,19 +862,17 @@ def forward(self, hidden_states, attention_mask,
         # expand the key_layer and value_layer [sk, b, ng, hn] -> [sk, b, np, hn]
         if self.num_attention_heads_per_partition // self.num_query_groups_per_partition > 1:
             key_layer = key_layer.repeat_interleave(
-                self.num_attention_heads_per_partition // self.num_query_groups_per_partition,
-                dim = 2
+                self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
             )
             value_layer = value_layer.repeat_interleave(
-                self.num_attention_heads_per_partition // self.num_query_groups_per_partition,
-                dim = 2
+                self.num_attention_heads_per_partition // self.num_query_groups_per_partition, dim=2
             )
 
         # apply relative positional encoding (rotary embedding)
         if rotary_pos_emb is not None:
             q_pos_emb, k_pos_emb = rotary_pos_emb
-            query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb,self.config)
-            key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb,self.config)
+            query_layer = apply_rotary_pos_emb(query_layer, q_pos_emb, self.config)
+            key_layer = apply_rotary_pos_emb(key_layer, k_pos_emb, self.config)
             # TODO, can apply positional embedding to value_layer so it has
             # absolute positional embedding.
             # otherwise, only relative positional embedding takes effect
@@ -818,13 +881,17 @@ def forward(self, hidden_states, attention_mask,
         if not self.use_flash_attn:
             if self.checkpoint_core_attention:
                 context_layer = self._checkpointed_attention_forward(
-                    query_layer, key_layer, value_layer, attention_mask)
+                    query_layer, key_layer, value_layer, attention_mask
+                )
             else:
                 context_layer = self.core_attention(
-                    query_layer, key_layer, value_layer, attention_mask)
+                    query_layer, key_layer, value_layer, attention_mask
+                )
         else:
-            q, k, v = [rearrange(x, 's b ... -> b s ...').contiguous()
-                       for x in (query_layer, key_layer, value_layer)]
+            q, k, v = [
+                rearrange(x, 's b ... -> b s ...').contiguous()
+                for x in (query_layer, key_layer, value_layer)
+            ]
             if not self.sequence_parallel:
                 with tensor_parallel.get_cuda_rng_tracker().fork():
                     context_layer = self.core_attention_flash(q, k, v)
@@ -853,22 +920,21 @@ def bias_dropout_add(x, bias, residual, prob, training):
 def get_bias_dropout_add(training):
     def _bias_dropout_add(x, bias, residual, prob):
         return bias_dropout_add(x, bias, residual, prob, training)
+
     return _bias_dropout_add
 
 
 @jit_fuser
-def bias_dropout_add_fused_train(x: torch.Tensor,
-                                 bias: Optional[torch.Tensor],
-                                 residual: torch.Tensor,
-                                 prob: float) -> torch.Tensor:
+def bias_dropout_add_fused_train(
+    x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, prob: float
+) -> torch.Tensor:
     return bias_dropout_add(x, bias, residual, prob, True)
 
 
 @jit_fuser
-def bias_dropout_add_fused_inference(x: torch.Tensor,
-                                     bias: Optional[torch.Tensor],
-                                     residual: torch.Tensor,
-                                     prob: float) -> torch.Tensor:
+def bias_dropout_add_fused_inference(
+    x: torch.Tensor, bias: Optional[torch.Tensor], residual: torch.Tensor, prob: float
+) -> torch.Tensor:
     return bias_dropout_add(x, bias, residual, prob, False)
 
 
@@ -879,18 +945,21 @@ class ParallelTransformerLayer(MegatronModule):
     output of the same size.
     """
 
-    def __init__(self, config,
-                 layer_number, layer_type=LayerType.encoder,
-                 self_attn_mask_type=AttnMaskType.padding,
-                 drop_path_rate=0.):
+    def __init__(
+        self,
+        config,
+        layer_number,
+        layer_type=LayerType.encoder,
+        self_attn_mask_type=AttnMaskType.padding,
+        drop_path_rate=0.0,
+    ):
         args = get_args()
 
         super(ParallelTransformerLayer, self).__init__()
         self.layer_number = layer_number
         self.layer_type = layer_type
 
-        self.apply_residual_connection_post_norm \
-            = config.apply_residual_connection_post_layernorm
+        self.apply_residual_connection_post_norm = config.apply_residual_connection_post_layernorm
 
         self.bf16 = config.bf16
         self.fp32_residual_connection = config.fp32_residual_connection
@@ -903,7 +972,8 @@ def __init__(self, config,
             config,
             layer_number,
             attention_type=AttnType.self_attn,
-            attn_mask_type=self_attn_mask_type)
+            attn_mask_type=self_attn_mask_type,
+        )
         self.hidden_dropout = config.hidden_dropout
         self.bias_dropout_fusion = config.bias_dropout_fusion
         self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None
@@ -921,22 +991,17 @@ def __init__(self, config,
         TORCH_MAJOR = int(torch.__version__.split('.')[0])
         TORCH_MINOR = int(torch.__version__.split('.')[1])
         use_nvfuser = TORCH_MAJOR > 1 or (TORCH_MAJOR == 1 and TORCH_MINOR >= 10)
-        self.bias_dropout_add_exec_handler = \
-                nullcontext if use_nvfuser else torch.enable_grad
-
-    def default_decoder_cross_attention(self,
-                                        encoder_output,
-                                        enc_dec_attn_mask,
-                                        norm_input,
-                                        norm_output,
-                                        bias_dropout_add_func):
+        self.bias_dropout_add_exec_handler = nullcontext if use_nvfuser else torch.enable_grad
+
+    def default_decoder_cross_attention(
+        self, encoder_output, enc_dec_attn_mask, norm_input, norm_output, bias_dropout_add_func
+    ):
         '''Cross attention for a standard encoder-decoder model.'''
 
         # Attention.
-        attention_output, attention_bias = \
-            self.inter_attention(norm_output,
-                                 enc_dec_attn_mask,
-                                 encoder_output=encoder_output)
+        attention_output, attention_bias = self.inter_attention(
+            norm_output, enc_dec_attn_mask, encoder_output=encoder_output
+        )
 
         # Residual connection.
         if self.apply_residual_connection_post_norm:
@@ -950,22 +1015,25 @@ def default_decoder_cross_attention(self,
         # Bias-dropout-add.
         with self.bias_dropout_add_exec_handler():
             norm_input = bias_dropout_add_func(
-                attention_output,
-                attention_bias,
-                residual,
-                self.hidden_dropout)
+                attention_output, attention_bias, residual, self.hidden_dropout
+            )
 
         # Normalize.
         norm_output = self.post_inter_attention_norm(norm_input)
 
         return norm_input, norm_output
 
-    def forward(self, hidden_states, attention_mask,
-                encoder_output=None, enc_dec_attn_mask=None,
-                inference_context=None,
-                rotary_pos_emb=None,
-                *,
-                inference_params=None):
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_output=None,
+        enc_dec_attn_mask=None,
+        inference_context=None,
+        rotary_pos_emb=None,
+        *,
+        inference_params=None,
+    ):
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
@@ -975,12 +1043,12 @@ def forward(self, hidden_states, attention_mask,
         norm_output = self.input_norm(hidden_states)
 
         # Self attention.
-        attention_output, attention_bias = \
-            self.self_attention(
-                norm_output,
-                attention_mask,
-                inference_context=inference_context,
-                rotary_pos_emb=rotary_pos_emb)
+        attention_output, attention_bias = self.self_attention(
+            norm_output,
+            attention_mask,
+            inference_context=inference_context,
+            rotary_pos_emb=rotary_pos_emb,
+        )
 
         # Residual connection.
         if self.apply_residual_connection_post_norm:
@@ -1005,14 +1073,12 @@ def forward(self, hidden_states, attention_mask,
                 attention_bias = attention_bias.expand_as(residual)
             with self.bias_dropout_add_exec_handler():
                 norm_input = bias_dropout_add_func(
-                    attention_output,
-                    attention_bias,
-                    residual,
-                    self.hidden_dropout)
+                    attention_output, attention_bias, residual, self.hidden_dropout
+                )
         else:
-            out = torch.nn.functional.dropout(attention_output + attention_bias,
-                                              p=self.hidden_dropout,
-                                              training=self.training)
+            out = torch.nn.functional.dropout(
+                attention_output + attention_bias, p=self.hidden_dropout, training=self.training
+            )
             norm_input = residual + self.drop_path(out)
 
         # Layer norm post the self attention.
@@ -1022,16 +1088,11 @@ def forward(self, hidden_states, attention_mask,
         if self.layer_type == LayerType.encoder:
             pass
         elif self.layer_type == LayerType.decoder:
-            norm_input, norm_output = \
-                self.default_decoder_cross_attention(
-                    encoder_output,
-                    enc_dec_attn_mask,
-                    norm_input,
-                    norm_output,
-                    bias_dropout_add_func)
+            norm_input, norm_output = self.default_decoder_cross_attention(
+                encoder_output, enc_dec_attn_mask, norm_input, norm_output, bias_dropout_add_func
+            )
         else:
-            raise Exception("Unsupported layer type, '%s'." %
-                            self.layer_type.name)
+            raise Exception("Unsupported layer type, '%s'." % self.layer_type.name)
 
         # MLP.
         mlp_output, mlp_bias = self.mlp(norm_output)
@@ -1046,11 +1107,7 @@ def forward(self, hidden_states, attention_mask,
             if mlp_bias is not None:
                 mlp_bias = mlp_bias.expand_as(residual)
             with self.bias_dropout_add_exec_handler():
-                output = bias_dropout_add_func(
-                    mlp_output,
-                    mlp_bias,
-                    residual,
-                    self.hidden_dropout)
+                output = bias_dropout_add_func(mlp_output, mlp_bias, residual, self.hidden_dropout)
 
             # Jit compiled function creates 'view' tensor. This tensor
             # potentially gets saved in the MPU checkpoint function context,
@@ -1058,16 +1115,16 @@ def forward(self, hidden_states, attention_mask,
             # won't result in memory savings (like the data loader, or
             # p2p_communication), it serves to document the origin of this
             # 'view' tensor.
-            output = core.utils.make_viewless_tensor(inp = output,
-                                                     requires_grad = output.requires_grad,
-                                                     keep_graph = True)
+            output = core.utils.make_viewless_tensor(
+                inp=output, requires_grad=output.requires_grad, keep_graph=True
+            )
 
         else:
             if mlp_bias is not None:
                 mlp_output = mlp_output + mlp_bias
-            out = torch.nn.functional.dropout(mlp_output,
-                                              p=self.hidden_dropout,
-                                              training=self.training)
+            out = torch.nn.functional.dropout(
+                mlp_output, p=self.hidden_dropout, training=self.training
+            )
             output = residual + self.drop_path(out)
 
         return output
@@ -1093,9 +1150,16 @@ def __init__(self, layer_number):
         super().__init__()
         self.layer_number = layer_number
 
-    def forward(self, hidden_states, attention_mask,
-                encoder_output=None, enc_dec_attn_mask=None,
-                inference_context=None, *, inference_params=None):
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_output=None,
+        enc_dec_attn_mask=None,
+        inference_context=None,
+        *,
+        inference_params=None,
+    ):
         return hidden_states.clone()
 
 
@@ -1103,8 +1167,9 @@ def _get_num_layers(args, model_type, is_decoder=False):
     """Compute the number of transformer layers resident on the current rank."""
     if mpu.get_pipeline_model_parallel_world_size() > 1:
         assert args.num_layers == args.encoder_num_layers
-        assert args.num_layers % args.transformer_pipeline_model_parallel_size == 0, \
-            'num_layers must be divisible by transformer_pipeline_model_parallel_size'
+        assert (
+            args.num_layers % args.transformer_pipeline_model_parallel_size == 0
+        ), 'num_layers must be divisible by transformer_pipeline_model_parallel_size'
 
         # When a standalone embedding stage is used, all transformer layers
         # are divided among pipeline rank >= 1, while on pipeline rank 0,
@@ -1113,8 +1178,8 @@ def _get_num_layers(args, model_type, is_decoder=False):
         num_layers = (
             0
             if args.account_for_embedding_in_pipeline_split
-            and mpu.get_pipeline_model_parallel_rank() == 0 else
-            args.num_layers // args.transformer_pipeline_model_parallel_size
+            and mpu.get_pipeline_model_parallel_rank() == 0
+            else args.num_layers // args.transformer_pipeline_model_parallel_size
         )
     else:
         if not is_decoder:
@@ -1127,13 +1192,17 @@ def _get_num_layers(args, model_type, is_decoder=False):
 class ParallelTransformer(MegatronModule):
     """Transformer class."""
 
-    def __init__(self, config,
-                 model_type, layer_type=LayerType.encoder,
-                 self_attn_mask_type=AttnMaskType.padding,
-                 post_norm=True,
-                 pre_process=True,
-                 post_process=True,
-                 drop_path_rate=0.0):
+    def __init__(
+        self,
+        config,
+        model_type,
+        layer_type=LayerType.encoder,
+        self_attn_mask_type=AttnMaskType.padding,
+        post_norm=True,
+        pre_process=True,
+        post_process=True,
+        drop_path_rate=0.0,
+    ):
         super(ParallelTransformer, self).__init__()
         args = get_args()
 
@@ -1152,8 +1221,9 @@ def __init__(self, config,
         self.recompute_granularity = config.recompute_granularity
         self.recompute_method = config.recompute_method
         self.recompute_num_layers = config.recompute_num_layers
-        self.distribute_saved_activations = \
+        self.distribute_saved_activations = (
             config.distribute_saved_activations and not config.sequence_parallel
+        )
 
         self.sequence_parallel = config.sequence_parallel
 
@@ -1172,15 +1242,17 @@ def __init__(self, config,
             if core.utils.is_te_min_version("0.11.0"):
                 self.transformer_engine_v_0_11 = True
 
-            assert not args.squared_relu, ("TransformerEngine does not support squared "
-                                           "relu activation.")
+            assert not args.squared_relu, (
+                "TransformerEngine does not support squared " "relu activation."
+            )
 
         self.use_fp8 = args.fp8 is not None
         self.fp8_recipe = None
         self.fp8_group = None
         if self.use_fp8:
-            assert args.transformer_impl == 'transformer_engine', \
-                'transformer-engine required for fp8 training and inference'
+            assert (
+                args.transformer_impl == 'transformer_engine'
+            ), 'transformer-engine required for fp8 training and inference'
             self.fp8_group = mpu.get_amax_reduction_group(tp_only_amax_red=config.tp_only_amax_red)
             if args.fp8 == "e4m3":
                 fp8_format = transformer_engine.common.recipe.Format.E4M3
@@ -1200,17 +1272,15 @@ def __init__(self, config,
         self.num_microbatches_in_previous_step = -1
         self.microbatch_count = 0
         self.checkpoint_core_attention = (
-            config.recompute_granularity == 'selective'
-            and "core_attn" in config.recompute_modules
+            config.recompute_granularity == 'selective' and "core_attn" in config.recompute_modules
         )
 
         # Number of layers.
-        self.num_layers = _get_num_layers(args, model_type,
-                                          layer_type==LayerType.decoder)
+        self.num_layers = _get_num_layers(args, model_type, layer_type == LayerType.decoder)
 
         self.drop_path_rates = [
-            rate.item() for rate in
-            torch.linspace(0, self.drop_path_rate, config.num_layers)]
+            rate.item() for rate in torch.linspace(0, self.drop_path_rate, config.num_layers)
+        ]
 
         def build_layer(layer_number):
             if args.transformer_impl == 'local':
@@ -1219,21 +1289,28 @@ def build_layer(layer_number):
                     layer_number,
                     layer_type=layer_type,
                     self_attn_mask_type=self_attn_mask_type,
-                    drop_path_rate=self.drop_path_rates[layer_number - 1])
+                    drop_path_rate=self.drop_path_rates[layer_number - 1],
+                )
             else:
                 # This argument is only available from TE v0.10 onwards.
                 extra_transformer_engine_kwargs = {}
                 if self.transformer_engine_v_0_8:
                     extra_transformer_engine_kwargs["bias"] = args.add_bias_linear
                 if self.transformer_engine_v_0_10:
-                    extra_transformer_engine_kwargs["activation"] = "swiglu" if args.swiglu else "gelu"
+                    extra_transformer_engine_kwargs["activation"] = (
+                        "swiglu" if args.swiglu else "gelu"
+                    )
                 if self.transformer_engine_v_0_11:
                     extra_transformer_engine_kwargs["normalization"] = args.normalization
-                assert config.attention_softmax_in_fp32, "TransformerEngine only supports softmax compute in FP32."
                 assert (
-                    (bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and args.fp16) == config.apply_query_key_layer_scaling
-                ), ("Unsupported config for apply_query_key_layer_scaling in TransformerEngine. If --apply-query-key-layer-scaling is "
-                    "provided, set env-var NVTE_APPLY_QK_LAYER_SCALING=1 and you must be using fp16.")
+                    config.attention_softmax_in_fp32
+                ), "TransformerEngine only supports softmax compute in FP32."
+                assert (
+                    bool(int(os.getenv("NVTE_APPLY_QK_LAYER_SCALING", "0"))) and args.fp16
+                ) == config.apply_query_key_layer_scaling, (
+                    "Unsupported config for apply_query_key_layer_scaling in TransformerEngine. If --apply-query-key-layer-scaling is "
+                    "provided, set env-var NVTE_APPLY_QK_LAYER_SCALING=1 and you must be using fp16."
+                )
                 return transformer_engine.pytorch.TransformerLayer(
                     config.hidden_size,
                     config.ffn_hidden_size,
@@ -1246,11 +1323,13 @@ def build_layer(layer_number):
                     layer_number=layer_number,
                     kv_channels=config.kv_channels,
                     self_attn_mask_type=self_attn_mask_type.name,
-                    tp_group=mpu.get_tensor_model_parallel_group() if mpu.is_initialized() else None,
+                    tp_group=(
+                        mpu.get_tensor_model_parallel_group() if mpu.is_initialized() else None
+                    ),
                     tp_size=mpu.get_tensor_model_parallel_world_size(),
-                    get_rng_state_tracker=get_cuda_rng_tracker
-                    if get_cuda_rng_tracker().is_initialized()
-                    else None,
+                    get_rng_state_tracker=(
+                        get_cuda_rng_tracker if get_cuda_rng_tracker().is_initialized() else None
+                    ),
                     fuse_wgrad_accumulation=config.gradient_accumulation_fusion,
                     seq_length=args.seq_length,
                     micro_batch_size=args.micro_batch_size,
@@ -1262,12 +1341,13 @@ def build_layer(layer_number):
                     drop_path_rate=self.drop_path_rates[layer_number - 1],
                     set_parallel_mode=True,
                     fuse_qkv_params=True,
-                    **extra_transformer_engine_kwargs)
+                    **extra_transformer_engine_kwargs,
+                )
 
         if config.virtual_pipeline_model_parallel_size is not None:
-            assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, \
-                'num_layers_per_stage must be divisible by ' \
-                'virtual_pipeline_model_parallel_size'
+            assert config.num_layers % config.virtual_pipeline_model_parallel_size == 0, (
+                'num_layers_per_stage must be divisible by ' 'virtual_pipeline_model_parallel_size'
+            )
             # Number of layers in each model chunk is the number of layers in the stage,
             # divided by the number of model chunks in a stage.
             self.num_layers = self.num_layers // config.virtual_pipeline_model_parallel_size
@@ -1280,8 +1360,8 @@ def build_layer(layer_number):
             # Stage 0: [0, 1]  [4, 5]
             # Stage 1: [2, 3]  [6, 7]
             offset = mpu.get_virtual_pipeline_model_parallel_rank() * (
-                config.num_layers // config.virtual_pipeline_model_parallel_size) + \
-                (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
+                config.num_layers // config.virtual_pipeline_model_parallel_size
+            ) + (mpu.get_pipeline_model_parallel_rank() * self.num_layers)
         else:
             # Each stage gets a contiguous set of layers.
             offset = mpu.get_pipeline_model_parallel_rank() * self.num_layers
@@ -1296,10 +1376,11 @@ def build_layer(layer_number):
             # this, we assign a 'no-op' layer on these ranks, which will
             # disconnect the input tensor from the output tensor.
             self.num_layers = 1
-            self.layers = torch.nn.ModuleList([ NoopTransformerLayer(1) ])
+            self.layers = torch.nn.ModuleList([NoopTransformerLayer(1)])
         else:
             self.layers = torch.nn.ModuleList(
-                [build_layer(i + 1 + offset) for i in range(self.num_layers)])
+                [build_layer(i + 1 + offset) for i in range(self.num_layers)]
+            )
 
         if self.post_process and self.post_norm:
             # Final layer norm before output.
@@ -1308,10 +1389,17 @@ def build_layer(layer_number):
     def _get_layer(self, layer_number):
         return self.layers[layer_number]
 
-    def _checkpointed_forward(self, hidden_states, attention_mask,
-                              encoder_output, enc_dec_attn_mask,
-                              rotary_pos_emb, is_first_microbatch):
+    def _checkpointed_forward(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_output,
+        enc_dec_attn_mask,
+        rotary_pos_emb,
+        is_first_microbatch,
+    ):
         """Forward method with activation checkpointing."""
+
         def custom(start, end):
             def custom_forward(*args, **kwargs):
                 x_, *args = args
@@ -1319,6 +1407,7 @@ def custom_forward(*args, **kwargs):
                     layer = self._get_layer(index)
                     x_ = layer(x_, *args, **kwargs)
                 return x_
+
             return custom_forward
 
         te_forward_kwargs = {}
@@ -1339,15 +1428,26 @@ def custom_forward(*args, **kwargs):
                         self.distribute_saved_activations,
                         tensor_parallel.get_cuda_rng_tracker,
                         mpu.get_tensor_model_parallel_group(),
-                        hidden_states, attention_mask, encoder_output,
-                        enc_dec_attn_mask, **te_forward_kwargs)
+                        hidden_states,
+                        attention_mask,
+                        encoder_output,
+                        enc_dec_attn_mask,
+                        **te_forward_kwargs,
+                    )
                 else:
                     hidden_states = tensor_parallel.checkpoint(
                         custom(l, l + self.recompute_num_layers),
                         self.distribute_saved_activations,
-                        hidden_states, attention_mask,
-                        encoder_output, enc_dec_attn_mask,
-                        None, None, None, None, rotary_pos_emb)
+                        hidden_states,
+                        attention_mask,
+                        encoder_output,
+                        enc_dec_attn_mask,
+                        None,
+                        None,
+                        None,
+                        None,
+                        rotary_pos_emb,
+                    )
 
                 l += self.recompute_num_layers
 
@@ -1363,25 +1463,47 @@ def custom_forward(*args, **kwargs):
                             self.distribute_saved_activations,
                             tensor_parallel.get_cuda_rng_tracker,
                             mpu.get_tensor_model_parallel_group(),
-                            hidden_states, attention_mask, encoder_output,
-                            enc_dec_attn_mask, **te_forward_kwargs)
+                            hidden_states,
+                            attention_mask,
+                            encoder_output,
+                            enc_dec_attn_mask,
+                            **te_forward_kwargs,
+                        )
                     else:
                         hidden_states = tensor_parallel.checkpoint(
                             custom(l, l + 1),
                             self.distribute_saved_activations,
-                            hidden_states, attention_mask,
-                            encoder_output, enc_dec_attn_mask,
-                            None, None, None, None, rotary_pos_emb)
+                            hidden_states,
+                            attention_mask,
+                            encoder_output,
+                            enc_dec_attn_mask,
+                            None,
+                            None,
+                            None,
+                            None,
+                            rotary_pos_emb,
+                        )
                 else:
                     if self.transformer_impl == 'transformer_engine':
                         hidden_states = custom(l, l + 1)(
-                            hidden_states, attention_mask, encoder_output,
-                            enc_dec_attn_mask, **te_forward_kwargs)
+                            hidden_states,
+                            attention_mask,
+                            encoder_output,
+                            enc_dec_attn_mask,
+                            **te_forward_kwargs,
+                        )
                     else:
                         hidden_states = custom(l, l + 1)(
-                            hidden_states, attention_mask,
-                            encoder_output, enc_dec_attn_mask,
-                            None, None, None, None, rotary_pos_emb)
+                            hidden_states,
+                            attention_mask,
+                            encoder_output,
+                            enc_dec_attn_mask,
+                            None,
+                            None,
+                            None,
+                            None,
+                            rotary_pos_emb,
+                        )
         else:
             raise ValueError("Invalid activation recompute method.")
 
@@ -1397,20 +1519,26 @@ def set_input_tensor(self, input_tensor):
         forward_step_func"""
         self.input_tensor = input_tensor
 
-    def forward(self, hidden_states, attention_mask,
-                encoder_output=None, enc_dec_attn_mask=None,
-                inference_context=None,
-                rotary_pos_emb=None,
-                *,
-                inference_params=None):
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_output=None,
+        enc_dec_attn_mask=None,
+        inference_context=None,
+        rotary_pos_emb=None,
+        *,
+        inference_params=None,
+    ):
         # hidden_states: [s, b, h]
 
         inference_context = deprecate_inference_params(inference_context, inference_params)
 
         # Checks.
         if inference_context:
-            assert self.recompute_granularity is None, \
-                'inference does not work with activation checkpointing'
+            assert (
+                self.recompute_granularity is None
+            ), 'inference does not work with activation checkpointing'
 
         if not self.pre_process:
             # See set_input_tensor()
@@ -1432,9 +1560,7 @@ def forward(self, hidden_states, attention_mask,
         #   already creates viewless tensors. That said, make_viewless_tensor()
         #   is called here to be future-proof and corner-case-proof.
         hidden_states = core.utils.make_viewless_tensor(
-            hidden_states,
-            requires_grad=True,
-            keep_graph=True,
+            hidden_states, requires_grad=True, keep_graph=True
         )
 
         # RNG context.
@@ -1447,11 +1573,13 @@ def forward(self, hidden_states, attention_mask,
         with rng_context:
             # The fp8_autocast context manager is a no-op when enabled=True
             # The if...else serves to short circuit name resolution for fp8_autocast
-            with transformer_engine.pytorch.fp8_autocast(
-                enabled=self.use_fp8,
-                fp8_recipe=self.fp8_recipe,
-                fp8_group=self.fp8_group
-            ) if self.use_fp8 else nullcontext():
+            with (
+                transformer_engine.pytorch.fp8_autocast(
+                    enabled=self.use_fp8, fp8_recipe=self.fp8_recipe, fp8_group=self.fp8_group
+                )
+                if self.use_fp8
+                else nullcontext()
+            ):
                 # Determine if the current iteration is first microbatch
                 if self.num_microbatches_in_previous_step != get_num_microbatches():
                     self.microbatch_count = 0
@@ -1460,12 +1588,14 @@ def forward(self, hidden_states, attention_mask,
 
                 # Forward pass.
                 if self.recompute_granularity == 'full':
-                    hidden_states = self._checkpointed_forward(hidden_states,
-                                                               attention_mask,
-                                                               encoder_output,
-                                                               enc_dec_attn_mask,
-                                                               rotary_pos_emb,
-                                                               is_first_microbatch)
+                    hidden_states = self._checkpointed_forward(
+                        hidden_states,
+                        attention_mask,
+                        encoder_output,
+                        enc_dec_attn_mask,
+                        rotary_pos_emb,
+                        is_first_microbatch,
+                    )
                 else:
                     forward_kwargs = {
                         'encoder_output': encoder_output,
@@ -1487,10 +1617,7 @@ def forward(self, hidden_states, attention_mask,
                     for index in range(self.num_layers):
                         layer = self._get_layer(index)
 
-                        hidden_states = layer(
-                            hidden_states,
-                            attention_mask,
-                            **forward_kwargs)
+                        hidden_states = layer(hidden_states, attention_mask, **forward_kwargs)
 
                 # Skip counter update for eval and activation checkpointing
                 if torch.is_grad_enabled() and self.training:
diff --git a/megatron/rl/agent/api.py b/megatron/rl/agent/api.py
index 2f3a31db445..9ba0d6a1354 100644
--- a/megatron/rl/agent/api.py
+++ b/megatron/rl/agent/api.py
@@ -13,11 +13,7 @@
 from megatron.core.utils import trace_async_exceptions
 
 from ..__init__ import Request, TypeLookupable
-from ..inference import (
-    InferenceInterface,
-    LLMChatMessage,
-    ReturnsRaw,
-)
+from ..inference import InferenceInterface, LLMChatMessage, ReturnsRaw
 
 
 class AgentBaseModel(BaseModel, extra='allow'):
@@ -218,10 +214,12 @@ async def get_grouped_rollouts(self, request: GroupedRolloutRequest):
         # When it's 1, the semaphore is a no-op.
         groups_per_worker = request.num_groups
         if groups_per_worker > 1:
-            assert not request.filter_groups_with_same_reward, \
-                "Cannot use filter_groups_with_same_reward with num_groups > 1."
-        assert self.parallel_generation_tasks >= groups_per_worker, \
-            f"{self.parallel_generation_tasks=} must be >= {groups_per_worker=}"
+            assert (
+                not request.filter_groups_with_same_reward
+            ), "Cannot use filter_groups_with_same_reward with num_groups > 1."
+        assert (
+            self.parallel_generation_tasks >= groups_per_worker
+        ), f"{self.parallel_generation_tasks=} must be >= {groups_per_worker=}"
         num_workers = self.parallel_generation_tasks // groups_per_worker
         unused = self.parallel_generation_tasks % groups_per_worker
         if unused:
@@ -252,10 +250,9 @@ async def generate_task():
                 batch_id = submitted_groups // groups_per_worker
                 submitted_groups += groups_per_worker
                 if groups_per_worker > 1:
-                    await asyncio.gather(*[
-                        generate_and_enqueue(batch_id, i)
-                        for i in range(groups_per_worker)
-                    ])
+                    await asyncio.gather(
+                        *[generate_and_enqueue(batch_id, i) for i in range(groups_per_worker)]
+                    )
                 else:
                     if not await generate_and_enqueue(batch_id, 0):
                         submitted_groups -= groups_per_worker
diff --git a/megatron/rl/agent/reward_only_agent.py b/megatron/rl/agent/reward_only_agent.py
index 9755da48112..8323a30ade6 100644
--- a/megatron/rl/agent/reward_only_agent.py
+++ b/megatron/rl/agent/reward_only_agent.py
@@ -6,12 +6,7 @@
 import numpy as np
 from tqdm.asyncio import tqdm
 
-from ..inference import (
-    InferenceResponse,
-    LLMChatMessage,
-    ReturnsRaw,
-    ReturnsTokens,
-)
+from ..inference import InferenceResponse, LLMChatMessage, ReturnsRaw, ReturnsTokens
 from .api import (
     EvaluationAgent,
     EvaluationRequest,
@@ -43,9 +38,7 @@ def get_dataset(self, validation: bool = False):
         """Return validation or train dataset."""
         raise NotImplementedError("Derived class must implement get_dataset.")
 
-    async def get_reward(
-        self, response: str, golden: Any, finish_reason: str
-    ) -> float:
+    async def get_reward(self, response: str, golden: Any, finish_reason: str) -> float:
         """Given the LLM response and the golden data, provide a reward."""
         raise NotImplementedError("Derived class must implement get_reward")
 
@@ -143,8 +136,15 @@ async def group_rollout(self, request: GroupedRolloutRequest) -> list[Rollout]:
             prompt, request.generation_args
         )
 
-        responses = await asyncio.gather(*[request.inference_interface.agenerate(inference_request) for _ in range(request.rollouts_per_group)])
-        return [await self.rollout_from_response(request, response, golden) for response in responses]
+        responses = await asyncio.gather(
+            *[
+                request.inference_interface.agenerate(inference_request)
+                for _ in range(request.rollouts_per_group)
+            ]
+        )
+        return [
+            await self.rollout_from_response(request, response, golden) for response in responses
+        ]
 
     async def _evaluation(
         self, prompt: str, golden: Any, request: EvaluationRequest
diff --git a/megatron/rl/inference/megatron.py b/megatron/rl/inference/megatron.py
index cec693a138f..0562ce7ed9f 100644
--- a/megatron/rl/inference/megatron.py
+++ b/megatron/rl/inference/megatron.py
@@ -10,6 +10,7 @@
 
 try:
     import h2  # noqa: F401
+
     use_http2 = True
 except ImportError:
     use_http2 = False
@@ -33,6 +34,7 @@
 logger = logging.getLogger(__name__)
 logging.getLogger("httpx").setLevel(logging.WARNING)
 
+
 class MegatronLocal(InferenceServer, ReturnsTokens, ReturnsRaw):
     """Interface to use MCoreEngine directly as an inference engine."""
 
@@ -99,11 +101,13 @@ async def launch(cls, model: GPTModel, **kwargs):
 
         inference_engine: DynamicInferenceEngine = get_dynamic_inference_engine(model=model)
         dp_addr = await inference_engine.start_listening_to_data_parallel_coordinator(
-            inference_coordinator_port=41521, launch_inference_coordinator=True,
+            inference_coordinator_port=41521, launch_inference_coordinator=True
         )
 
         if dist.get_rank() == 0:
-            from megatron.core.inference.text_generation_server.dynamic_text_gen_server import start_text_gen_server
+            from megatron.core.inference.text_generation_server.dynamic_text_gen_server import (
+                start_text_gen_server,
+            )
 
             client = InferenceClient(inference_coordinator_address=dp_addr)
             client.start()
@@ -126,21 +130,18 @@ async def launch(cls, model: GPTModel, **kwargs):
             args.rl_kv_cache_management_mode
         )
 
-        concurrency_limit = args.grpo_prompts_per_step * args.grpo_group_size * args.rl_parallel_generation_tasks
-        custom_limits = httpx.Limits(
-            max_connections=concurrency_limit,
-            max_keepalive_connections=concurrency_limit,
+        concurrency_limit = (
+            args.grpo_prompts_per_step * args.grpo_group_size * args.rl_parallel_generation_tasks
         )
-        http_client = DefaultAioHttpClient(
-            timeout=None,
-            limits=custom_limits,
-            http2=use_http2
+        custom_limits = httpx.Limits(
+            max_connections=concurrency_limit, max_keepalive_connections=concurrency_limit
         )
+        http_client = DefaultAioHttpClient(timeout=None, limits=custom_limits, http2=use_http2)
 
         launched_server._openai_client = AsyncOpenAI(
             base_url=f"http://{launched_server.host}:{launched_server.port}",
             api_key="NONE",
-            http_client=http_client
+            http_client=http_client,
         )
 
         return launched_server
@@ -163,7 +164,10 @@ async def kill(self):
             self._client.stop()
 
         if dist.get_rank() == 0:
-            from megatron.core.inference.text_generation_server.dynamic_text_gen_server import stop_text_gen_server
+            from megatron.core.inference.text_generation_server.dynamic_text_gen_server import (
+                stop_text_gen_server,
+            )
+
             stop_text_gen_server()
 
     def set_generation_epoch(self, generation_epoch: int):
diff --git a/megatron/rl/rl_utils.py b/megatron/rl/rl_utils.py
index 8a564315dc3..a159952c6ce 100644
--- a/megatron/rl/rl_utils.py
+++ b/megatron/rl/rl_utils.py
@@ -1,20 +1,20 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+import copy
 import gc
 
-import copy
-from functools import partial
 # Keep this to make the env registered.
 import itertools
-import math
-import logging
 import json
+import logging
+import math
 import os
 from collections import Counter, defaultdict
 from contextlib import contextmanager, nullcontext
 from dataclasses import dataclass
+from functools import partial
 from pathlib import Path
-from typing import Any, Dict, Iterator, List, Optional 
+from typing import Any, Dict, Iterator, List, Optional
 
 import numpy as np
 import torch
@@ -22,46 +22,39 @@
 import yaml
 from torch.utils.data import DataLoader, TensorDataset
 from torch.utils.tensorboard import SummaryWriter
+from wandb import wandb_run
 
 from megatron.core import mpu
-from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.full_cuda_graph import FullCudaGraphWrapper
+from megatron.core.inference.contexts.dynamic_context import HAVE_TORCH_MEMORY_SAVER
+from megatron.core.inference.unified_memory import (
+    advise_managed_module_parameters_preferred_location,
+    prefetch_managed_module_parameters,
+)
+from megatron.core.inference.utils import device_memory_summary, set_decode_expert_padding
 from megatron.core.models.common.language_module.language_module import LanguageModule
 from megatron.core.num_microbatches_calculator import reconfigure_num_microbatches_calculator
 from megatron.core.optimizer import MegatronOptimizer
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.pipeline_parallel import get_forward_backward_func
-from megatron.core.pipeline_parallel.utils import is_pp_last_stage, get_pp_last_rank
+from megatron.core.pipeline_parallel.utils import get_pp_last_rank, is_pp_last_stage
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.rerun_state_machine import RerunDataIterator
+from megatron.core.resharding.refit import swap_model_weights
 from megatron.core.tokenizers import MegatronTokenizer
 from megatron.core.tokenizers.text.libraries.huggingface_tokenizer import HuggingFaceTokenizer
 from megatron.core.transformer.cuda_graphs import _CudagraphGlobalRecord
-from megatron.core.transformer.enums import CudaGraphScope
-from megatron.core.transformer.utils import (
-    toggle_cuda_graphs,
-    transition_moe_cudagraphs,
-)
-from megatron.core.inference.utils import set_decode_expert_padding
-from megatron.core.resharding.refit import swap_model_weights
-from megatron.core.inference.unified_memory import (
-    advise_managed_module_parameters_preferred_location,
-    prefetch_managed_module_parameters,
+from megatron.core.transformer.custom_layers.batch_invariant_kernels import (
+    is_batch_invariant_mode_enabled,
 )
-from megatron.core.inference.utils import device_memory_summary
-from megatron.core.utils import get_asyncio_loop, log_single_rank
-from megatron.rl.sequence_packing_utils import (
-    get_microbatch_dataloader,
-    pack_inference_logprobs,
-    compute_packed_inference_logprobs_stats,
-    pack_all_trajectories,
-    load_packed_data_by_index,
-    get_sequence_packing_tensorboard_metrics,
-    get_sequence_packing_log_info,
-    get_default_packed_seq_params,
-    get_packing_actual_tokens,
-    get_packing_compute_tokens,
-    get_packing_efficiency,
-    get_packing_avg_seq_length,
-    update_microbatch_calculator,
+from megatron.core.transformer.enums import CudaGraphScope
+from megatron.core.transformer.utils import toggle_cuda_graphs, transition_moe_cudagraphs
+from megatron.core.utils import (
+    get_asyncio_loop,
+    get_attr_wrapped_model,
+    get_pg_rank,
+    get_pg_size,
+    log_single_rank,
 )
 from megatron.rl.agent.api import (
     EvaluationRequest,
@@ -78,6 +71,21 @@
 from megatron.rl.inference.megatron import MegatronLocal
 from megatron.rl.logging import LOG_DIR as lang_rl_log_dir
 from megatron.rl.logging import log as lang_rl_log
+from megatron.rl.sequence_packing_utils import (
+    compute_packed_inference_logprobs_stats,
+    get_default_packed_seq_params,
+    get_microbatch_dataloader,
+    get_packing_actual_tokens,
+    get_packing_avg_seq_length,
+    get_packing_compute_tokens,
+    get_packing_efficiency,
+    get_sequence_packing_log_info,
+    get_sequence_packing_tensorboard_metrics,
+    load_packed_data_by_index,
+    pack_all_trajectories,
+    pack_inference_logprobs,
+    update_microbatch_calculator,
+)
 from megatron.rl.server.inference.inference_interface_server import InferenceInterfaceServer
 from megatron.training.global_vars import (
     get_args,
@@ -91,14 +99,7 @@
     print_rank_0,
     unwrap_model,
 )
-from megatron.core.utils import get_pg_rank, get_pg_size, get_attr_wrapped_model
-from megatron.core.process_groups_config import ProcessGroupCollection
-from wandb import wandb_run
-from megatron.core.transformer.custom_layers.batch_invariant_kernels import (
-    is_batch_invariant_mode_enabled,
-)
 
-from megatron.core.inference.contexts.dynamic_context import HAVE_TORCH_MEMORY_SAVER
 if HAVE_TORCH_MEMORY_SAVER:
     from torch_memory_saver import torch_memory_saver
 
@@ -169,15 +170,21 @@ def _maybe_prefetch_separate_inference_model_weights(model_core, *, to_cpu: bool
     device = -1 if to_cpu else int(torch.cuda.current_device())
     # Note: include_buffers=False because buffers created with explicit device= in register_buffer()
     # are not allocated via the UVM mempool and will fail UVM operations. Only parameters are UVM-allocated.
-    advise_managed_module_parameters_preferred_location(model_core, device=device, include_buffers=False)
+    advise_managed_module_parameters_preferred_location(
+        model_core, device=device, include_buffers=False
+    )
     nbytes = prefetch_managed_module_parameters(model_core, device=device, include_buffers=False)
     # Ensure pages are resident before we enter CUDA-graph capture / inference, or before training continues.
     torch.cuda.synchronize()
 
     if to_cpu:
-        print_rank_0(f"[Rank 0] offloaded {nbytes / 1024**2:.2f} MB of separate RL inference model weights to CPU (other ranks may vary)")
+        print_rank_0(
+            f"[Rank 0] offloaded {nbytes / 1024**2:.2f} MB of separate RL inference model weights to CPU (other ranks may vary)"
+        )
     else:
-        print_rank_0(f"[Rank 0] prefetched {nbytes / 1024**2:.2f} MB of separate RL inference model weights to GPU (other ranks may vary)")
+        print_rank_0(
+            f"[Rank 0] prefetched {nbytes / 1024**2:.2f} MB of separate RL inference model weights to GPU (other ranks may vary)"
+        )
 
 
 def verify_model_weights_swap(
@@ -222,8 +229,11 @@ def verify_model_weights_swap(
     # Generate deterministic test input - same across ALL ranks
     torch.manual_seed(1234)
     test_tokens = torch.randint(
-        low=0, high=actual_vocab_size, size=(batch_size, actual_seq_len),
-        device=device, dtype=torch.long
+        low=0,
+        high=actual_vocab_size,
+        size=(batch_size, actual_seq_len),
+        device=device,
+        dtype=torch.long,
     )
     test_position_ids = (
         torch.arange(actual_seq_len, device=device, dtype=torch.long)
@@ -244,25 +254,23 @@ def verify_model_weights_swap(
     try:
         with torch.no_grad():
             train_output = train_lm(
-                test_tokens, test_position_ids, test_attention_mask,
-                runtime_gather_output=True
+                test_tokens, test_position_ids, test_attention_mask, runtime_gather_output=True
             )
 
             inf_output = inf_lm(
-                test_tokens, test_position_ids, test_attention_mask,
-                runtime_gather_output=True
+                test_tokens, test_position_ids, test_attention_mask, runtime_gather_output=True
             )
 
         # Only check on ranks that have output (last PP stage)
         if train_output is not None and inf_output is not None:
-            assert train_output.shape == inf_output.shape, (
-                f"Output shape mismatch: train={train_output.shape}, infer={inf_output.shape}"
-            )
-            
+            assert (
+                train_output.shape == inf_output.shape
+            ), f"Output shape mismatch: train={train_output.shape}, infer={inf_output.shape}"
+
             max_diff = (train_output - inf_output).abs().max().item()
-            assert torch.allclose(train_output, inf_output, atol=atol, rtol=rtol), (
-                f"Forward pass outputs do not match: max_diff={max_diff:.6e}, atol={atol}, rtol={rtol}"
-            )
+            assert torch.allclose(
+                train_output, inf_output, atol=atol, rtol=rtol
+            ), f"Forward pass outputs do not match: max_diff={max_diff:.6e}, atol={atol}, rtol={rtol}"
 
     finally:
         # Restore training state
@@ -272,14 +280,13 @@ def verify_model_weights_swap(
             inf_core.train()
 
 
-
 @dataclass(slots=True)
 class RolloutStats:
-    rewards: list[list[float]] # inner list is for a group
-    env_ids: list[str] # same length as len(rewards)
-    turn_lens: list[list[int]] # token lengths of turns, grouped.
-    traj_lens: list[list[int]] # all turns comprise one trajectory.
-    num_turns: None | list[list[int]] # num_turns per traj
+    rewards: list[list[float]]  # inner list is for a group
+    env_ids: list[str]  # same length as len(rewards)
+    turn_lens: list[list[int]]  # token lengths of turns, grouped.
+    traj_lens: list[list[int]]  # all turns comprise one trajectory.
+    num_turns: None | list[list[int]]  # num_turns per traj
     advantages: None | list[list[float]]
     min_piold_to_inf_prob: None | float
     max_piold_to_inf_prob: None | float
@@ -341,7 +348,9 @@ def get_rl_runtime_state():
     return _rl_runtime_state
 
 
-def log_rl_throughput_metrics(args, batch_size, elapsed_time_per_iteration, iteration, wandb_writer):
+def log_rl_throughput_metrics(
+    args, batch_size, elapsed_time_per_iteration, iteration, wandb_writer
+):
     """Compute, log, and store RL token throughput metrics.
 
     Returns a string fragment to append to the training log line.
@@ -394,21 +403,30 @@ def log_rl_throughput_metrics(args, batch_size, elapsed_time_per_iteration, iter
     # Log throughput metrics to wandb
     if wandb_writer is not None:
         if tokens_per_sec is not None:
-            wandb_writer.log({
-                'throughput/tokens_per_sec': tokens_per_sec,
-                'throughput/tokens_per_sec_per_gpu': tokens_per_sec_per_gpu,
-            }, iteration)
+            wandb_writer.log(
+                {
+                    'throughput/tokens_per_sec': tokens_per_sec,
+                    'throughput/tokens_per_sec_per_gpu': tokens_per_sec_per_gpu,
+                },
+                iteration,
+            )
         if compute_tokens_per_sec is not None:
-            wandb_writer.log({
-                'throughput/compute_tokens_per_sec': compute_tokens_per_sec,
-                'throughput/compute_tokens_per_sec_per_gpu': compute_tokens_per_sec_per_gpu,
-            }, iteration)
+            wandb_writer.log(
+                {
+                    'throughput/compute_tokens_per_sec': compute_tokens_per_sec,
+                    'throughput/compute_tokens_per_sec_per_gpu': compute_tokens_per_sec_per_gpu,
+                },
+                iteration,
+            )
         if actual_tokens_per_sec is not None:
-            wandb_writer.log({
-                'throughput/actual_tokens_per_sec': actual_tokens_per_sec,
-                'throughput/actual_tokens_per_sec_per_gpu': actual_tokens_per_sec_per_gpu,
-                'throughput/packing_efficiency': packing_efficiency,
-            }, iteration)
+            wandb_writer.log(
+                {
+                    'throughput/actual_tokens_per_sec': actual_tokens_per_sec,
+                    'throughput/actual_tokens_per_sec_per_gpu': actual_tokens_per_sec_per_gpu,
+                    'throughput/packing_efficiency': packing_efficiency,
+                },
+                iteration,
+            )
 
     # Store derived throughput metrics on RLRuntimeState so that
     # downstream consumers (e.g. RLProfiler) can read them.
@@ -519,7 +537,9 @@ def align_unpacked_inference_logprobs(
             truncated_mask = torch.nn.functional.pad(truncated_mask, (0, pad_size), value=False)
 
     # Sanity check: Two probability values cannot be more than 1.0 apart
-    abs_diffs = (old_logprobs_for_data.exp() - padded_inference_logprobs.exp()).abs()[truncated_mask]
+    abs_diffs = (old_logprobs_for_data.exp() - padded_inference_logprobs.exp()).abs()[
+        truncated_mask
+    ]
     assert all(abs_diffs <= 1.0)
 
     # Update group statistics using common helper
@@ -543,8 +563,7 @@ def get_agent(args, parallel_generation_tasks: int | None = None):
         config = yaml.safe_load(f)
 
     return WeightedMultiTask.from_config(
-        config,
-        parallel_generation_tasks=parallel_generation_tasks,
+        config, parallel_generation_tasks=parallel_generation_tasks
     )
 
 
@@ -556,10 +575,8 @@ def get_inference_interface(args, loop, model):
     if _INFERENCE_INTERFACE is None:
         _INFERENCE_INTERFACE = loop.run_until_complete(
             MegatronLocal.launch(
-                model[0],
-                host='0.0.0.0',
-                port=8294,
-                verbose=args.inference_text_gen_server_logging)
+                model[0], host='0.0.0.0', port=8294, verbose=args.inference_text_gen_server_logging
+            )
         )
     return _INFERENCE_INTERFACE
 
@@ -590,7 +607,11 @@ def get_rollout_generator(args, inference_interface, n_prompts, samples_per_grou
 
 
 def get_environment_rollouts(
-    model: LanguageModule, inference_model: LanguageModule, optimizer: MegatronOptimizer, n_prompts: int, samples_per_group: int
+    model: LanguageModule,
+    inference_model: LanguageModule,
+    optimizer: MegatronOptimizer,
+    n_prompts: int,
+    samples_per_group: int,
 ):
     """Sample environment rollouts from an LLM.
 
@@ -613,7 +634,8 @@ def get_environment_rollouts(
                     model[0].offload_grad_buffers()
             else:
                 logger.warning(
-                    "Gradient buffers will not be offloaded when training cudagraphs are enabled!")
+                    "Gradient buffers will not be offloaded when training cudagraphs are enabled!"
+                )
             with nvtx_range("rl/offload/optimizer-state", time=True):
                 optimizer.offload_to_cpu()
 
@@ -628,17 +650,14 @@ def get_environment_rollouts(
         swap_model_weights(model, inference_model, args.refit_method)
         if args.rl_verify_model_weights_swap:
             verify_model_weights_swap(
-                train_model=model,
-                inference_model=inference_model,
-                atol=.1,
-                rtol=5e-4,
+                train_model=model, inference_model=inference_model, atol=0.1, rtol=5e-4
             )
     else:
         inference_model = model
 
     inference_pg_collection = get_attr_wrapped_model(inference_model[0], "pg_collection")
     pg_size = get_pg_size(inference_pg_collection.ep)
-    assert (n_prompts % pg_size == 0), f"{n_prompts=} must be divisible by {pg_size=}"
+    assert n_prompts % pg_size == 0, f"{n_prompts=} must be divisible by {pg_size=}"
 
     with nvtx_range("rl/rollout-collection", time=True):
         loop = get_asyncio_loop()
@@ -646,7 +665,7 @@ def get_environment_rollouts(
             inference_model,
             optimizer,
             args.cuda_graph_impl,
-            False, # offload optimizer during rollout collection is handled above
+            False,  # offload optimizer during rollout collection is handled above
             training_model=model if has_separate_inference_model else None,
         ) as inference_interface:
 
@@ -671,7 +690,11 @@ def get_environment_rollouts(
                     # In deterministic mode, sort rollouts by problem_id for consistent ordering
                     # regardless of completion order due to system timing jitter.
                     if torch.are_deterministic_algorithms_enabled():
-                        rollouts.sort(key=lambda group: group[0].problem_id if group and group[0].problem_id else "")
+                        rollouts.sort(
+                            key=lambda group: (
+                                group[0].problem_id if group and group[0].problem_id else ""
+                            )
+                        )
                     if not args.rl_partial_rollouts:
                         while True:
                             try:
@@ -742,7 +765,9 @@ def selective_log_softmax(logits, index):
     return per_token_logps
 
 
-def get_logprobs(model, tokens, position_ids, no_grad=False, sequence_packing=False, packed_seq_params=None):
+def get_logprobs(
+    model, tokens, position_ids, no_grad=False, sequence_packing=False, packed_seq_params=None
+):
     """Get sequence logprobs from their token ids.
 
     Args:
@@ -850,7 +875,7 @@ def calculate_grpo_advantages(rewards: list[list[float]], num_turns: list[list[i
 
 
 def compute_group_stats(
-    rollouts: GroupedRollouts, tokenizer: MegatronTokenizer, seq_len: int,
+    rollouts: GroupedRollouts, tokenizer: MegatronTokenizer, seq_len: int
 ) -> RolloutStats:
     """Add group-based rollout stats for logging.
 
@@ -870,7 +895,7 @@ def compute_group_stats(
     rewards = []
     env_ids = []
     group_reward_ids = []
-    num_turns = [] # num_turns per traj
+    num_turns = []  # num_turns per traj
     all_policy_epoch = []
     all_kv_cache_epoch = []
     all_completed_epochs = []
@@ -916,7 +941,7 @@ def compute_group_stats(
         all_num_evictions.append(group_num_evictions)
         traj_lens.append(group_traj_lengths)
         turn_lens.append(group_turn_lengths)
-        env_ids.append(group[0].env_id) # All rollouts in a group share the env_id by design.
+        env_ids.append(group[0].env_id)  # All rollouts in a group share the env_id by design.
         rewards.append(group_rewards)
         # https://arxiv.org/abs/2504.21233 reports that lens variance hurts.
         # Let's track this.
@@ -949,23 +974,21 @@ def compute_group_stats(
     return stats
 
 
-
 def prep_wandb_metrics(
-        wandb_writer: wandb_run.Run,
-        traj_lens: List[List[int]],
-        turn_lens: List[List[int]],
-        rewards: List[List[float]],
-        num_turns: List[List[int]],
-        advantages: List[float],
-        policy_epoch: List[List[List[int]]],
-        kv_cache_epoch: List[List[List[int]]],
-        completed_epochs: List[List[int]],
-        num_evictions: List[List[int]],
-        current_iteration: int,
-        example_group: list[TokenRollout | Rollout] | None = None,
-        tokenizer: MegatronTokenizer | None = None,
-    ):
-
+    wandb_writer: wandb_run.Run,
+    traj_lens: List[List[int]],
+    turn_lens: List[List[int]],
+    rewards: List[List[float]],
+    num_turns: List[List[int]],
+    advantages: List[float],
+    policy_epoch: List[List[List[int]]],
+    kv_cache_epoch: List[List[List[int]]],
+    completed_epochs: List[List[int]],
+    num_evictions: List[List[int]],
+    current_iteration: int,
+    example_group: list[TokenRollout | Rollout] | None = None,
+    tokenizer: MegatronTokenizer | None = None,
+):
     """Make a wandb-parseable dictionary of metrics for logging.
 
     Args:
@@ -985,46 +1008,46 @@ def prep_wandb_metrics(
     """
 
     group_table = wandb_writer.Table(
-        columns=['group_means', 'group_stds'],
-        data=[[np.mean(g), np.std(g)] for g in rewards],
+        columns=['group_means', 'group_stds'], data=[[np.mean(g), np.std(g)] for g in rewards]
     )
 
     # Per-rollout staleness (oldest token)
     rollout_policy_staleness = [current_iteration - r[0] for g in policy_epoch for r in g]
     rollout_kv_staleness = [current_iteration - r[0] for g in kv_cache_epoch for r in g]
     # Per-rollout staleness (newest token)
-    rollout_policy_last_token_staleness = [current_iteration - r[-1] for g in policy_epoch for r in g]
+    rollout_policy_last_token_staleness = [
+        current_iteration - r[-1] for g in policy_epoch for r in g
+    ]
     rollout_kv_last_token_staleness = [current_iteration - r[-1] for g in kv_cache_epoch for r in g]
     # Per-token staleness
     per_token_policy_staleness = [current_iteration - e for g in policy_epoch for r in g for e in r]
     per_token_kv_staleness = [current_iteration - e for g in kv_cache_epoch for r in g for e in r]
 
     metrics = {
-            'group_means_hist': wandb_writer.plot.histogram(
-                group_table, 'group_means', 'Group Means'
-            ),
-            'group_stds_hist': wandb_writer.plot.histogram(
-                group_table, 'group_stds', 'Group STDs'
-            ),
-            'rewards_hist': wandb_writer.plot.histogram(
-                wandb_writer.Table(
-                    columns=['reward'], data=[[r] for g in rewards for r in g]
-                ),
-                'reward', 'All Rewards'
-            ),
-            'advantages_hist': wandb_writer.plot.histogram(
-                wandb_writer.Table(
-                    columns=['advantages'], data=[[x] for x in advantages]
-                ),
-                'advantages', 'Advantages'
-            ),
-            'rollout_table': wandb_writer.Table(
-                columns=[
-                    'reward', 'traj_length', 'num_evictions',
-                    'policy_staleness', 'kv_staleness',
-                    'policy_last_token_staleness', 'kv_last_token_staleness',
-                ],
-                data=list(zip(
+        'group_means_hist': wandb_writer.plot.histogram(group_table, 'group_means', 'Group Means'),
+        'group_stds_hist': wandb_writer.plot.histogram(group_table, 'group_stds', 'Group STDs'),
+        'rewards_hist': wandb_writer.plot.histogram(
+            wandb_writer.Table(columns=['reward'], data=[[r] for g in rewards for r in g]),
+            'reward',
+            'All Rewards',
+        ),
+        'advantages_hist': wandb_writer.plot.histogram(
+            wandb_writer.Table(columns=['advantages'], data=[[x] for x in advantages]),
+            'advantages',
+            'Advantages',
+        ),
+        'rollout_table': wandb_writer.Table(
+            columns=[
+                'reward',
+                'traj_length',
+                'num_evictions',
+                'policy_staleness',
+                'kv_staleness',
+                'policy_last_token_staleness',
+                'kv_last_token_staleness',
+            ],
+            data=list(
+                zip(
                     [r for g in rewards for r in g],
                     [l for g in traj_lens for l in g],
                     [e for g in num_evictions for e in g],
@@ -1032,55 +1055,63 @@ def prep_wandb_metrics(
                     rollout_kv_staleness,
                     rollout_policy_last_token_staleness,
                     rollout_kv_last_token_staleness,
-                )),
-            ),
-            # NOTE: This table can get very large (one row per token across all rollouts).
-            'per_token_table': wandb_writer.Table(
-                columns=['policy_staleness', 'kv_staleness'],
-                data=list(zip(per_token_policy_staleness, per_token_kv_staleness)),
-            ),
-            'mean_turn_length': np.mean([np.mean(g) for g in turn_lens]),
-            'mean_turn_length_std': np.mean([np.std(g) for g in turn_lens]),
-            'max_turn_length': max([max(g) for g in turn_lens]),
-            'min_turn_length': min([min(g) for g in turn_lens]),
-            'mean_traj_length': np.mean([np.mean(g) for g in traj_lens]),
-            'mean_traj_length_std': np.mean([np.std(g) for g in traj_lens]),
-            'max_traj_length': max([max(g) for g in traj_lens]),
-            'min_traj_length': min([min(g) for g in traj_lens]),
-            'mean_num_turns': np.mean([np.mean(g) for g in num_turns]),
-            'max_num_turns': max([max(g) for g in num_turns]),
-            'min_num_turns': min([min(g) for g in num_turns]),
-            'mean_reward': np.mean([np.mean(g) for g in rewards]),
-            'mean_advantage': np.mean(advantages),
-            'nonzero_groups_ratio': np.count_nonzero(advantages)
-            / len(advantages),
-            'mean_policy_staleness': np.mean(rollout_policy_staleness),
-            'max_policy_staleness': max(rollout_policy_staleness),
-            'min_policy_staleness': min(rollout_policy_staleness),
-            'mean_kv_cache_staleness': np.mean(rollout_kv_staleness),
-            'max_kv_cache_staleness': max(rollout_kv_staleness),
-            'min_kv_cache_staleness': min(rollout_kv_staleness),
-            'mean_policy_last_token_staleness': np.mean(rollout_policy_last_token_staleness),
-            'max_policy_last_token_staleness': max(rollout_policy_last_token_staleness),
-            'min_policy_last_token_staleness': min(rollout_policy_last_token_staleness),
-            'mean_kv_cache_last_token_staleness': np.mean(rollout_kv_last_token_staleness),
-            'max_kv_cache_last_token_staleness': max(rollout_kv_last_token_staleness),
-            'min_kv_cache_last_token_staleness': min(rollout_kv_last_token_staleness),
-            'total_eviction_count': sum([sum(g) for g in num_evictions]),
-            'max_num_evictions': max([max(g) for g in num_evictions]),
-            'mean_completion_gap': np.mean([current_iteration - s for g in completed_epochs for s in g]),
-            'per_token_policy_staleness_hist': wandb_writer.plot.histogram(
-                wandb_writer.Table(columns=['staleness'], data=[[s] for s in per_token_policy_staleness]),
-                'staleness', 'Per-Token Policy Staleness'
+                )
             ),
-            'per_token_kv_staleness_hist': wandb_writer.plot.histogram(
-                wandb_writer.Table(columns=['staleness'], data=[[s] for s in per_token_kv_staleness]),
-                'staleness', 'Per-Token KV Cache Staleness'
+        ),
+        # NOTE: This table can get very large (one row per token across all rollouts).
+        'per_token_table': wandb_writer.Table(
+            columns=['policy_staleness', 'kv_staleness'],
+            data=list(zip(per_token_policy_staleness, per_token_kv_staleness)),
+        ),
+        'mean_turn_length': np.mean([np.mean(g) for g in turn_lens]),
+        'mean_turn_length_std': np.mean([np.std(g) for g in turn_lens]),
+        'max_turn_length': max([max(g) for g in turn_lens]),
+        'min_turn_length': min([min(g) for g in turn_lens]),
+        'mean_traj_length': np.mean([np.mean(g) for g in traj_lens]),
+        'mean_traj_length_std': np.mean([np.std(g) for g in traj_lens]),
+        'max_traj_length': max([max(g) for g in traj_lens]),
+        'min_traj_length': min([min(g) for g in traj_lens]),
+        'mean_num_turns': np.mean([np.mean(g) for g in num_turns]),
+        'max_num_turns': max([max(g) for g in num_turns]),
+        'min_num_turns': min([min(g) for g in num_turns]),
+        'mean_reward': np.mean([np.mean(g) for g in rewards]),
+        'mean_advantage': np.mean(advantages),
+        'nonzero_groups_ratio': np.count_nonzero(advantages) / len(advantages),
+        'mean_policy_staleness': np.mean(rollout_policy_staleness),
+        'max_policy_staleness': max(rollout_policy_staleness),
+        'min_policy_staleness': min(rollout_policy_staleness),
+        'mean_kv_cache_staleness': np.mean(rollout_kv_staleness),
+        'max_kv_cache_staleness': max(rollout_kv_staleness),
+        'min_kv_cache_staleness': min(rollout_kv_staleness),
+        'mean_policy_last_token_staleness': np.mean(rollout_policy_last_token_staleness),
+        'max_policy_last_token_staleness': max(rollout_policy_last_token_staleness),
+        'min_policy_last_token_staleness': min(rollout_policy_last_token_staleness),
+        'mean_kv_cache_last_token_staleness': np.mean(rollout_kv_last_token_staleness),
+        'max_kv_cache_last_token_staleness': max(rollout_kv_last_token_staleness),
+        'min_kv_cache_last_token_staleness': min(rollout_kv_last_token_staleness),
+        'total_eviction_count': sum([sum(g) for g in num_evictions]),
+        'max_num_evictions': max([max(g) for g in num_evictions]),
+        'mean_completion_gap': np.mean(
+            [current_iteration - s for g in completed_epochs for s in g]
+        ),
+        'per_token_policy_staleness_hist': wandb_writer.plot.histogram(
+            wandb_writer.Table(
+                columns=['staleness'], data=[[s] for s in per_token_policy_staleness]
             ),
+            'staleness',
+            'Per-Token Policy Staleness',
+        ),
+        'per_token_kv_staleness_hist': wandb_writer.plot.histogram(
+            wandb_writer.Table(columns=['staleness'], data=[[s] for s in per_token_kv_staleness]),
+            'staleness',
+            'Per-Token KV Cache Staleness',
+        ),
     }
     if example_group:
         if tokenizer is None:
-            raise ValueError("If you provide an example group to log, you need to provide a tokenizer too.")
+            raise ValueError(
+                "If you provide an example group to log, you need to provide a tokenizer too."
+            )
         metrics['rollouts'] = wandb_writer.Table(
             columns=['Trajectories', 'Tokens', 'Rewards'],
             rows=[
@@ -1089,7 +1120,8 @@ def prep_wandb_metrics(
                     r.trajectory,
                     r.reward,
                 ]
-                for r in example_group for turn in r.trajectory
+                for r in example_group
+                for turn in r.trajectory
             ],
         )
     return metrics
@@ -1113,7 +1145,9 @@ def maybe_log_training_metrics(
     wandb_writer = get_wandb_writer()
     tb_writer = get_tensorboard_writer()
     if tb_writer:
-        tb_writer.add_scalar('mean_reward', np.mean([np.mean(g) for g in group_stats.rewards]), current_iteration)
+        tb_writer.add_scalar(
+            'mean_reward', np.mean([np.mean(g) for g in group_stats.rewards]), current_iteration
+        )
     if not wandb_writer:
         return
 
@@ -1140,10 +1174,19 @@ def maybe_log_training_metrics(
     completed_epochs = group_stats.completed_epochs
     num_evictions = group_stats.num_evictions
 
-    metrics = metrics | prep_wandb_metrics(wandb_writer=wandb_writer,
-        traj_lens=traj_lens, turn_lens=turn_lens, rewards=rewards, num_turns=num_turns, advantages=advantages,
-        policy_epoch=policy_epoch, kv_cache_epoch=kv_cache_epoch, completed_epochs=completed_epochs,
-        num_evictions=num_evictions, current_iteration=current_iteration)
+    metrics = metrics | prep_wandb_metrics(
+        wandb_writer=wandb_writer,
+        traj_lens=traj_lens,
+        turn_lens=turn_lens,
+        rewards=rewards,
+        num_turns=num_turns,
+        advantages=advantages,
+        policy_epoch=policy_epoch,
+        kv_cache_epoch=kv_cache_epoch,
+        completed_epochs=completed_epochs,
+        num_evictions=num_evictions,
+        current_iteration=current_iteration,
+    )
     env_stats = lambda cont, idx: [cont[i] for i in idx]
     group_turn_counts = [sum(nt) for nt in num_turns]
 
@@ -1157,7 +1200,9 @@ def maybe_log_training_metrics(
             end = st + group_turn_counts[i]
             env_advantages.extend(advantages[st:end])
 
-        env_metrics = prep_wandb_metrics(wandb_writer=wandb_writer, traj_lens=env_stats(traj_lens, env_idx),
+        env_metrics = prep_wandb_metrics(
+            wandb_writer=wandb_writer,
+            traj_lens=env_stats(traj_lens, env_idx),
             turn_lens=env_stats(turn_lens, env_idx),
             rewards=env_stats(rewards, env_idx),
             num_turns=env_stats(num_turns, env_idx),
@@ -1177,7 +1222,11 @@ def maybe_log_training_metrics(
 
 
 def prepare_trajectories(
-    rollouts: Rollouts, tokenizer: MegatronTokenizer, seq_length: int, sequence_packing: bool, skip_bos_token: bool
+    rollouts: Rollouts,
+    tokenizer: MegatronTokenizer,
+    seq_length: int,
+    sequence_packing: bool,
+    skip_bos_token: bool,
 ):
     """Pad trajectories and extract the generation masks.
     Args:
@@ -1197,7 +1246,7 @@ def prepare_trajectories(
     DEFAULT_PAD_TOKENS = ['<|finetune_right_pad_id|>', '<SPECIAL_999>']
 
     if tokenizer.library == "huggingface":
-        tokenizer : HuggingFaceTokenizer
+        tokenizer: HuggingFaceTokenizer
         if not tokenizer.pad:
             for pad_token in DEFAULT_PAD_TOKENS:
                 if pad_token in tokenizer._tokenizer.tokenizer.get_vocab():
@@ -1239,7 +1288,9 @@ def prepare_trajectories(
         )
         for turn_idx, trajectory in enumerate(all_turns_trajectories):
             inf_logprobs = rollout.logprobs[turn_idx]
-            generation_mask = rollout.generation_mask[turn_idx] if isinstance(rollout, TokenRollout) else None
+            generation_mask = (
+                rollout.generation_mask[turn_idx] if isinstance(rollout, TokenRollout) else None
+            )
             length = len(trajectory)
             assert length <= seq_length, "Rollout too long, how did this happen?"
             if len(trajectory) < seq_length:
@@ -1289,7 +1340,7 @@ def prepare_trajectories(
     else:
         assert (
             tokenizer.bos is None or (trajs[:, 0] != tokenizer.bos).all()
-        ), "First token should not be bos"  
+        ), "First token should not be bos"
     assert (
         tokenizer.bos is None or (trajs[:, 1] != tokenizer.bos).all()
     ), "Second token should not be bos"
@@ -1311,7 +1362,7 @@ def logprobs_forward_step(data_iterator, model, is_correction, packing_context=N
     if packing_context is not None:
         # When using sequence packing, the data iterator returns a tuple with a single element, the bin index.
         bin_tensor = next(data_iterator)[0]
-        #TODO(jalbericiola): change for named tuple
+        # TODO(jalbericiola): change for named tuple
         (b_trajs, _, _, _, b_posids, _, _, _, _, _, b_packed_seq_params) = (
             load_packed_data_by_index(bin_tensor.item(), packing_context, is_correction)
         )
@@ -1339,7 +1390,7 @@ def compute_logprobs_batch(
     data_loader,
     forward_backward_func,
     packing_context,
-    trajs_batch_size, # n_bins for seq packing, and batch_size for non seq packing
+    trajs_batch_size,  # n_bins for seq packing, and batch_size for non seq packing
     seq_length,
     logprobs_batch_size,
     decoder_seq_length,
@@ -1353,7 +1404,9 @@ def compute_logprobs_batch(
     data_iterator = iter(data_loader)
     for i in range(len(data_loader)):
         output_tensor = forward_backward_func(
-            forward_step_func=partial(logprobs_forward_step, is_correction=is_correction, packing_context=packing_context),
+            forward_step_func=partial(
+                logprobs_forward_step, is_correction=is_correction, packing_context=packing_context
+            ),
             data_iterator=data_iterator,
             model=model,
             num_microbatches=1,
@@ -1372,10 +1425,7 @@ def compute_logprobs_batch(
         assert logprobs.dtype == dtype
     else:
         logprobs = torch.empty(
-            trajs_batch_size,
-            seq_length-1,
-            dtype=dtype,
-            device=torch.cuda.current_device(),
+            trajs_batch_size, seq_length - 1, dtype=dtype, device=torch.cuda.current_device()
         )
 
     # Only PP>1 needs a broadcast from the last stage; for PP=1 the output is already local.
@@ -1422,12 +1472,14 @@ def prepare_data_for_update(
         with nvtx_range("rl/compute-group-stats", time=True):
             group_stats = compute_group_stats(rollouts, tokenizer, args.seq_length)
             # TODO(vitalyk): why do we need global_advantages here? go inside packing
-            advantages = global_advantages = torch.tensor(group_stats.advantages, dtype=dtype).cuda()
+            advantages = global_advantages = torch.tensor(
+                group_stats.advantages, dtype=dtype
+            ).cuda()
 
         # Now split the rollouts across the data parallel ranks for training
         # This needs to be done at this point because we are about to calculate logprobs
-        # Note :- For EP, do not use the expert data parallel group here. Always 
-        # use the regular data parallel group. 
+        # Note :- For EP, do not use the expert data parallel group here. Always
+        # use the regular data parallel group.
 
         # Get example group per environment to log their rollouts.
         example_groups = {}
@@ -1443,7 +1495,9 @@ def prepare_data_for_update(
         total_turns_sampled = len(rollouts)
 
         # We might sample more than we consume in one step.
-        samples_ratio_per_step = args.global_batch_size / (args.grpo_prompts_per_step * args.grpo_group_size)
+        samples_ratio_per_step = args.global_batch_size / (
+            args.grpo_prompts_per_step * args.grpo_group_size
+        )
         assert samples_ratio_per_step <= 1, "You cannot use more data than you sampled."
 
         if (data_parallel_world_size := mpu.get_data_parallel_world_size()) > 0:
@@ -1454,8 +1508,8 @@ def prepare_data_for_update(
             )
             rollouts = rollouts[data_split_range[0] : data_split_range[1]]
             local_num_turns = sum(num_turns[data_split_range[0] : data_split_range[1]])
-            steps_before = sum(num_turns[:data_split_range[0]])
-            advantages = advantages[steps_before:steps_before+local_num_turns]
+            steps_before = sum(num_turns[: data_split_range[0]])
+            advantages = advantages[steps_before : steps_before + local_num_turns]
             # First we calculate them on a global level and then we split and recalculate on a local level.
             # Sequence packing and reporting needs it global but non-packing wants it local.
 
@@ -1469,15 +1523,15 @@ def prepare_data_for_update(
         if sequence_packing:
             with nvtx_range("rl/sequence-packing", time=True):
                 runtime_state.packing_context = packing_context = pack_all_trajectories(
-                    trajs, 
-                    generation_masks, 
-                    inference_logprobs, 
-                    global_advantages, 
-                    args.seq_length, 
+                    trajs,
+                    generation_masks,
+                    inference_logprobs,
+                    global_advantages,
+                    args.seq_length,
                     args.rl_sequence_packing_max_sequences_per_bin,
-                    args.rl_sequence_packing_algo
-                    )
-    
+                    args.rl_sequence_packing_algo,
+                )
+
                 compute_trajs = packing_context.packed_trajs
                 compute_position_ids = packing_context.packed_position_ids
                 # Use batch_size=1 for packed computation to enable proper attention masking
@@ -1506,19 +1560,19 @@ def prepare_data_for_update(
                 )
                 logprobs_batch_size = args.micro_batch_size
 
-
         with torch.no_grad(), nvtx_range("rl/compute-logprobs", time=True):
             # Before we can update the model, we need to get the logprobs for the \pi_{old} model.
 
             forward_backward_func = get_forward_backward_func()
-            if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope:
+            if (
+                args.cuda_graph_impl == "local"
+                and CudaGraphScope.full_iteration in args.cuda_graph_scope
+            ):
                 forward_backward_func = FullCudaGraphWrapper(
                     forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps
                 )
 
-            dtype = (
-                torch.bfloat16 if args.bf16 else (torch.float16 if args.fp16 else torch.float32)
-            )
+            dtype = torch.bfloat16 if args.bf16 else (torch.float16 if args.fp16 else torch.float32)
 
             pg_collection = get_attr_wrapped_model(model, "pg_collection")
             pp_group = pg_collection.pp
@@ -1566,7 +1620,6 @@ def prepare_data_for_update(
                     gc.collect()
                     torch.cuda.empty_cache()
 
-
         if sequence_packing:
             with nvtx_range("rl/pack-logprobs", time=True):
                 # Store logprobs on gpu in packing context
@@ -1595,7 +1648,9 @@ def prepare_data_for_update(
                     # Store packed inference logprobs in packing context
                     packing_context.packed_inference_logprobs = packed_inference_logprobs.cuda()
                     # Only mark as having inference logprobs for IS correction if enabled
-                    packing_context.has_inference_logprobs = args.rl_inference_logprobs_is_correction
+                    packing_context.has_inference_logprobs = (
+                        args.rl_inference_logprobs_is_correction
+                    )
             with nvtx_range("rl/create-dataloader", time=True):
                 # @vitalyk: This function also reconfigures the data loader to count the
                 # global_batch_size in the bins frame of reference.
@@ -1604,13 +1659,15 @@ def prepare_data_for_update(
 
                 update_microbatch_calculator(
                     samples_ratio_per_step=samples_ratio_per_step,
-                    num_bins_this_rank = len(packing_context.packed_trajs),
-                    bin_seq_indices = packing_context.packing_info.bin_seq_indices,
+                    num_bins_this_rank=len(packing_context.packed_trajs),
+                    bin_seq_indices=packing_context.packing_info.bin_seq_indices,
                     global_batch_size=args.global_batch_size,
                     micro_batch_size=args.micro_batch_size,
                     decrease_batch_size_if_needed=args.decrease_batch_size_if_needed,
-               )
-                loader = get_microbatch_dataloader(len(packing_context.packed_trajs), args.micro_batch_size)
+                )
+                loader = get_microbatch_dataloader(
+                    len(packing_context.packed_trajs), args.micro_batch_size
+                )
         else:
             with nvtx_range("rl/align-inference-logprobs", time=True):
                 if inference_logprobs is not None:
@@ -1632,7 +1689,7 @@ def prepare_data_for_update(
 
                 reconfigure_num_microbatches_calculator(
                     rank=torch.distributed.get_rank() if torch.distributed.is_initialized() else 0,
-                    global_batch_size=math.ceil(samples_ratio_per_step*total_turns_sampled),
+                    global_batch_size=math.ceil(samples_ratio_per_step * total_turns_sampled),
                     micro_batch_size=args.micro_batch_size,
                     decrease_batch_size_if_needed=args.decrease_batch_size_if_needed,
                     data_parallel_size=mpu.get_data_parallel_world_size(),
@@ -1699,10 +1756,8 @@ def get_grpo_data_iterator(
 
     # We collect new rollouts when we've gone over the collected data 'grpo_iterations' times.
     global_batches_per_collection = (grpo_prompts_per_step * grpo_group_size) // global_batch_size
-    if (
-        buffered_rollouts is None or
-        iteration == runtime_state.last_collection_iteration +
-        (grpo_iterations * global_batches_per_collection)
+    if buffered_rollouts is None or iteration == runtime_state.last_collection_iteration + (
+        grpo_iterations * global_batches_per_collection
     ):
 
         rollouts = get_environment_rollouts(
@@ -2004,11 +2059,14 @@ def megatron_rl_inference_mode(
             with nvtx_range("rl/offload-optimizer-before-inference", time=True):
                 if not args.rl_training_cuda_graphs:
                     with nvtx_range("rl/offload/grad-buffers", time=True):
-                        model_for_grad_offload = training_model if training_model is not None else model
+                        model_for_grad_offload = (
+                            training_model if training_model is not None else model
+                        )
                         model_for_grad_offload[0].offload_grad_buffers()
                 else:
                     logger.warning(
-                        "Gradient buffers will not be offloaded when training cudagraphs are enabled!")
+                        "Gradient buffers will not be offloaded when training cudagraphs are enabled!"
+                    )
                 with nvtx_range("rl/offload/optimizer-state", time=True):
                     optimizer.offload_to_cpu()
 
@@ -2058,7 +2116,9 @@ def megatron_rl_inference_mode(
                     optimizer.restore_from_cpu()
 
         # Set training model back to train mode (not inference model if they're separate)
-        training_lang_module = unwrap_model(training_model[0]) if training_model is not None else lang_module
+        training_lang_module = (
+            unwrap_model(training_model[0]) if training_model is not None else lang_module
+        )
         training_lang_module.train()
 
         if has_lru_cache:
@@ -2087,6 +2147,7 @@ def rl_inference_interface_shutdown():
     # It seem the Flask server has non-daemon threads that are preventing the program from exiting.
     # We need to find a way to gracefully complete all in progress requests and shutdown the Flask server.
     import os
+
     os._exit(0)
 
 
@@ -2099,7 +2160,8 @@ def get_iteration_sequence_count(args):
     if torch.distributed.is_initialized():
         torch.distributed.all_reduce(sequences_tensor, group=mpu.get_data_parallel_group())
     return int(sequences_tensor.item())
-    
+
+
 def _pad_nonnull_with_zeros(data: list[Optional[torch.Tensor]], max_len: int) -> torch.Tensor:
     """Pad each element of a list of tensors to the length required.
     Args:
diff --git a/megatron/rl/sequence_packing_utils.py b/megatron/rl/sequence_packing_utils.py
index ff98b0a58e2..c78e7d39127 100644
--- a/megatron/rl/sequence_packing_utils.py
+++ b/megatron/rl/sequence_packing_utils.py
@@ -1,22 +1,24 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
-import torch
+import logging
 import math
+import typing
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Tuple
+
 import numpy as np
-from typing import List, Dict, Any, Tuple, Optional
+import torch
 from torch.utils.data import DataLoader, TensorDataset
-from dataclasses import dataclass, field
+
+from megatron.core import mpu
+from megatron.core.num_microbatches_calculator import (
+    get_num_microbatches,
+    reconfigure_num_microbatches_calculator,
+)
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.utils import log_single_rank
 from megatron.training.global_vars import get_args, get_tokenizer
 from megatron.training.utils import get_nvtx_range
-from megatron.core.packed_seq_params import PackedSeqParams
-from megatron.core import mpu
-import logging
-import typing
-from megatron.core.num_microbatches_calculator import (
-        get_num_microbatches,
-        reconfigure_num_microbatches_calculator,
-    )
 
 logger = logging.getLogger(__name__)
 
@@ -32,6 +34,7 @@ class PackingInfo:
         seq_to_bin_idx: List mapping each global sequence index to its bin index
         packing_algo: Algorithm used for distributing bins ('fifo' or 'round-robin')
     """
+
     bin_seq_indices: List[List[int]]
     seq_starts: Dict[int, List[int]]
     seq_lengths: List[int]
@@ -56,6 +59,7 @@ class PackingContext:
         bin_advantages: List of advantage tensors for each bin
         cached_packed_seq_params: Pre-computed PackedSeqParams for each bin
     """
+
     bin_size: int
     packer: 'SequencePacker'
     packing_info: PackingInfo
@@ -69,7 +73,9 @@ class PackingContext:
     cached_packed_seq_params: List[Optional[PackedSeqParams]] = field(default_factory=list)
 
 
-def load_packed_data_by_index(bin_idx: int, packing_context: PackingContext, logprobs_is_correction: bool):
+def load_packed_data_by_index(
+    bin_idx: int, packing_context: PackingContext, logprobs_is_correction: bool
+):
     """Load packed data by index.
 
     Args:
@@ -144,9 +150,7 @@ def log_packing_efficiency(packing_context: PackingContext):
     total_tokens = sum(packing_info.seq_lengths)  # All sequences
     my_sequences = sum(len(indices) for indices in my_bin_seq_indices)
     my_tokens = sum(
-        packing_info.seq_lengths[idx]
-        for indices in my_bin_seq_indices
-        for idx in indices
+        packing_info.seq_lengths[idx] for indices in my_bin_seq_indices for idx in indices
     )
     total_capacity = packed_trajs.shape[0] * packed_trajs.shape[1]
     packing_efficiency = my_tokens / total_capacity if total_capacity > 0 else 0
@@ -159,13 +163,9 @@ def log_packing_efficiency(packing_context: PackingContext):
         logging.INFO,
         f"[Sequence Packing]  - Total sequences: {len(packing_info.seq_lengths)}",
     )
+    log_single_rank(logger, logging.INFO, f"[Sequence Packing]  - Total bins: {num_bins}")
     log_single_rank(
-        logger, logging.INFO, f"[Sequence Packing]  - Total bins: {num_bins}"
-    )
-    log_single_rank(
-        logger,
-        logging.INFO,
-        f"[Sequence Packing]  - Bin size: {packed_trajs.shape[1]} tokens",
+        logger, logging.INFO, f"[Sequence Packing]  - Bin size: {packed_trajs.shape[1]} tokens"
     )
     log_single_rank(
         logger,
@@ -196,12 +196,8 @@ def log_packing_efficiency(packing_context: PackingContext):
                 float(len(my_bin_seq_indices)),  # total bins
                 float(len(non_empty_bins)),  # non-empty bins
                 float(my_sequences),  # total sequences
-                (
-                    float(min(non_empty_bins)) if non_empty_bins else 0.0
-                ),  # min sequences per bin
-                (
-                    float(max(non_empty_bins)) if non_empty_bins else 0.0
-                ),  # max sequences per bin
+                (float(min(non_empty_bins)) if non_empty_bins else 0.0),  # min sequences per bin
+                (float(max(non_empty_bins)) if non_empty_bins else 0.0),  # max sequences per bin
                 (
                     float(my_sequences / len(non_empty_bins)) if non_empty_bins else 0.0
                 ),  # avg sequences per non-empty bin
@@ -258,13 +254,10 @@ def log_packing_efficiency(packing_context: PackingContext):
             total_seqs_all_ranks = sum(int(stats[3].item()) for stats in all_rank_stats)
             avg_seqs_per_rank = total_seqs_all_ranks / world_size
             max_deviation = max(
-                abs(int(stats[3].item()) - avg_seqs_per_rank)
-                for stats in all_rank_stats
+                abs(int(stats[3].item()) - avg_seqs_per_rank) for stats in all_rank_stats
             )
             log_single_rank(
-                logger,
-                logging.INFO,
-                "[Sequence Packing]  Round-robin distribution quality:",
+                logger, logging.INFO, "[Sequence Packing]  Round-robin distribution quality:"
             )
             log_single_rank(
                 logger,
@@ -277,6 +270,7 @@ def log_packing_efficiency(packing_context: PackingContext):
                 f"[Sequence Packing]  - Max deviation from average: {max_deviation:.0f} sequences ({max_deviation/avg_seqs_per_rank*100:.1f}%)",
             )
 
+
 def get_actual_sequence_lengths(sequences: torch.Tensor, pad_token: int) -> List[int]:
     """Get actual sequence lengths for pre-padded sequences.
 
@@ -307,11 +301,11 @@ def get_actual_sequence_lengths(sequences: torch.Tensor, pad_token: int) -> List
 
 
 def create_empty_bins(
-    num_empty_bins : int,
-    bin_size : int,
-    packed_trajs : torch.Tensor,
-    packed_position_ids : torch.Tensor,
-    packed_loss_mask : torch.Tensor,
+    num_empty_bins: int,
+    bin_size: int,
+    packed_trajs: torch.Tensor,
+    packed_position_ids: torch.Tensor,
+    packed_loss_mask: torch.Tensor,
     tokenizer,
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[Dict[str, Any]]]:
     """Create empty bins for padding to ensure all ranks have the same number of bins.
@@ -368,14 +362,12 @@ def create_empty_bins(
         empty_position_ids = None
         empty_loss_mask = None
 
-    return (
-        empty_trajs,
-        empty_position_ids,
-        empty_loss_mask,
-        empty_packing_info_entries,
-    )
+    return (empty_trajs, empty_position_ids, empty_loss_mask, empty_packing_info_entries)
 
-def get_default_packed_seq_params(seq_length: int, max_sequences_per_bin: int, device: torch.device) -> PackedSeqParams:
+
+def get_default_packed_seq_params(
+    seq_length: int, max_sequences_per_bin: int, device: torch.device
+) -> PackedSeqParams:
     """Create a default PackedSeqParams that acts as no-op for a single sequence.
 
     This ensures CUDA graph signature consistency when packed_seq_params
@@ -396,7 +388,7 @@ def get_default_packed_seq_params(seq_length: int, max_sequences_per_bin: int, d
     # Pad to the maximum number of sequences in the bin for the attention kernel.
     # We add 2 to account for the initial 0 and the final bin_size.
     cu_seqlens = torch.full(
-        (max_sequences_per_bin + 2,), seq_length, dtype=torch.int32, device=device,
+        (max_sequences_per_bin + 2,), seq_length, dtype=torch.int32, device=device
     )
     cu_seqlens[0] = 0
 
@@ -411,6 +403,7 @@ def get_default_packed_seq_params(seq_length: int, max_sequences_per_bin: int, d
         total_tokens=seq_length,
     )
 
+
 def create_packed_seq_params(packing_context: PackingContext):
     cached_packed_seq_params = []
     packing_info = packing_context.packing_info
@@ -428,12 +421,13 @@ def create_packed_seq_params(packing_context: PackingContext):
         cached_packed_seq_params.append(params)
     return cached_packed_seq_params
 
+
 def create_packed_seq_params_for_bin(
     packing_info: PackingInfo,
     bin_idx: int,
     bin_size: int,
     max_sequences_per_bin: int,
-    device: torch.device
+    device: torch.device,
 ) -> Optional[PackedSeqParams]:
     """Create PackedSeqParams for a single bin to enable proper attention masking in TE.
 
@@ -471,7 +465,7 @@ def create_packed_seq_params_for_bin(
     # We add 2 to account for the initial 0 and the final bin_size.
     if len(cu_seqlens) < max_sequences_per_bin + 2:
         out = cu_seqlens.new_full((max_sequences_per_bin + 2,), bin_size)
-        out[:len(cu_seqlens)] = cu_seqlens
+        out[: len(cu_seqlens)] = cu_seqlens
         cu_seqlens = out
 
     max_seqlen = bin_size
@@ -738,21 +732,15 @@ def pack_sequences(
             seq_starts=seq_starts_dict,
             seq_lengths=seq_lengths,
             seq_to_bin_idx=seq_to_bin_idx,
-            packing_algo='fifo'
+            packing_algo='fifo',
         )
 
         seq_per_bin = [len(indices) for indices in packing_info.bin_seq_indices]
+        log_single_rank(logger, logging.DEBUG, ("Initial packing output (before distribution):"))
         log_single_rank(
-            logger, logging.DEBUG, ("Initial packing output (before distribution):")
-        )
-        log_single_rank(
-            logger,
-            logging.DEBUG,
-            f"  - Total bins created: {len(packing_info.bin_seq_indices)}",
-        )
-        log_single_rank(
-            logger, logging.DEBUG, f"  - Total sequences packed: {sum(seq_per_bin)}"
+            logger, logging.DEBUG, f"  - Total bins created: {len(packing_info.bin_seq_indices)}"
         )
+        log_single_rank(logger, logging.DEBUG, f"  - Total sequences packed: {sum(seq_per_bin)}")
         log_single_rank(
             logger,
             logging.DEBUG,
@@ -762,6 +750,7 @@ def pack_sequences(
 
         return packed_sequences, position_ids, loss_mask, packing_info
 
+
 def distribute_packed_bins(
     packed_trajs: torch.Tensor,
     packed_position_ids: torch.Tensor,
@@ -808,7 +797,6 @@ def distribute_packed_bins(
     my_bin_seq_indices = []
     my_seq_starts = {}
 
-
     # Build the local data from the global indices
     for new_idx, old_idx in enumerate(my_bin_indices):
         my_packed_trajs.append(packed_trajs[old_idx])
@@ -822,10 +810,7 @@ def distribute_packed_bins(
         torch.stack(my_packed_trajs)
         if my_packed_trajs
         else torch.empty(
-            0,
-            packed_trajs.shape[1],
-            dtype=packed_trajs.dtype,
-            device=packed_trajs.device,
+            0, packed_trajs.shape[1], dtype=packed_trajs.dtype, device=packed_trajs.device
         )
     )
     packed_position_ids = (
@@ -851,23 +836,17 @@ def distribute_packed_bins(
 
     # Debug: Check what we're extracting
     log_single_rank(logger, logging.DEBUG, (f"Rank 0 {packing_algo} bin assignment:"))
-    log_single_rank(
-        logger, logging.DEBUG, f"  - Total bins before distribution: {num_bins}"
-    )
+    log_single_rank(logger, logging.DEBUG, f"  - Total bins before distribution: {num_bins}")
     log_single_rank(
         logger,
         logging.DEBUG,
         f"  - Bins assigned to rank 0: {my_bin_indices[:10]}... (showing first 10)",
     )
     log_single_rank(
-        logger,
-        logging.DEBUG,
-        f"  - Number of bins for this rank: {len(my_bin_indices)}",
+        logger, logging.DEBUG, f"  - Number of bins for this rank: {len(my_bin_indices)}"
     )
     log_single_rank(
-        logger,
-        logging.DEBUG,
-        f"  - Length of my_bin_seq_indices: {len(my_bin_seq_indices)}",
+        logger, logging.DEBUG, f"  - Length of my_bin_seq_indices: {len(my_bin_seq_indices)}"
     )
     if len(my_bin_seq_indices) > 0:
         log_single_rank(
@@ -891,25 +870,20 @@ def distribute_packed_bins(
         num_empty_bins = max_bins_per_rank - current_bins
 
         # Create empty bins using the helper function
-        (
-            empty_trajs,
-            empty_position_ids,
-            empty_loss_mask,
-            empty_packing_entries,
-        ) = create_empty_bins(
-            num_empty_bins,
-            bin_size,
-            packed_trajs,
-            packed_position_ids,
-            packed_loss_mask,
-            tokenizer,
+        (empty_trajs, empty_position_ids, empty_loss_mask, empty_packing_entries) = (
+            create_empty_bins(
+                num_empty_bins,
+                bin_size,
+                packed_trajs,
+                packed_position_ids,
+                packed_loss_mask,
+                tokenizer,
+            )
         )
 
         # Append empty bins to packed tensors
         packed_trajs = torch.cat([packed_trajs, empty_trajs], dim=0)
-        packed_position_ids = torch.cat(
-            [packed_position_ids, empty_position_ids], dim=0
-        )
+        packed_position_ids = torch.cat([packed_position_ids, empty_position_ids], dim=0)
         packed_loss_mask = torch.cat([packed_loss_mask, empty_loss_mask], dim=0)
 
         # Add empty entries to packing_info
@@ -921,13 +895,22 @@ def distribute_packed_bins(
     return packed_trajs, packed_position_ids, packed_loss_mask, new_packing_info
 
 
-def pack_all_trajectories(trajs, generation_masks, inference_logprobs, global_advantages, bin_size, max_sequences_per_bin, packing_algo):
+def pack_all_trajectories(
+    trajs,
+    generation_masks,
+    inference_logprobs,
+    global_advantages,
+    bin_size,
+    max_sequences_per_bin,
+    packing_algo,
+):
     tokenizer = get_tokenizer()
     data_parallel_world_size = mpu.get_data_parallel_world_size()
     data_parallel_group = mpu.get_data_parallel_group()
     nvtx_range = get_nvtx_range()
 
     with nvtx_range("rl/regather-trajectories", time=True):
+
         def _gather(data):
             data = data.cuda()
             data_list = [torch.empty_like(data) for _ in range(data_parallel_world_size)]
@@ -942,31 +925,20 @@ def _gather(data):
     with nvtx_range("rl/pack-sequences", time=True):
         # Create packer with max sequences per bin limit to prevent extreme imbalance
         packer = SequencePacker(
-            bin_size=bin_size,
-            pad_token=tokenizer.pad,
-            max_sequences_per_bin=max_sequences_per_bin,
+            bin_size=bin_size, pad_token=tokenizer.pad, max_sequences_per_bin=max_sequences_per_bin
         )
 
         # Pack sequences with generation masks
-        (
-            packed_trajs,
-            packed_position_ids,
-            packed_loss_mask,
-            packing_info,
-        ) = packer.pack_sequences(trajs, generation_masks)
+        (packed_trajs, packed_position_ids, packed_loss_mask, packing_info) = packer.pack_sequences(
+            trajs, generation_masks
+        )
         packing_info.packing_algo = packing_algo
 
         # Distribute packed bins across the data parallel ranks
-        (
-            packed_trajs,
-            packed_position_ids,
-            packed_loss_mask,
-            packing_info,
-        ) = distribute_packed_bins(
-            packed_trajs,
-            packed_position_ids,
-            packed_loss_mask,
-            packing_info,
+        (packed_trajs, packed_position_ids, packed_loss_mask, packing_info) = (
+            distribute_packed_bins(
+                packed_trajs, packed_position_ids, packed_loss_mask, packing_info
+            )
         )
 
     # Create bin_advantages list
@@ -984,12 +956,13 @@ def _gather(data):
     # Create a temporary packing context to pass to create_packed_seq_params
     cached_packed_seq_params = [
         create_packed_seq_params_for_bin(
-                packing_info=packing_info,
-                bin_idx=bin_idx,
-                bin_size=bin_size,
-                max_sequences_per_bin=max_sequences_per_bin,
-                device=packed_trajs.device,
-            ) for bin_idx in range(len(packed_trajs))
+            packing_info=packing_info,
+            bin_idx=bin_idx,
+            bin_size=bin_size,
+            max_sequences_per_bin=max_sequences_per_bin,
+            device=packed_trajs.device,
+        )
+        for bin_idx in range(len(packed_trajs))
     ]
 
     # Create the final PackingContext
@@ -1011,6 +984,7 @@ def _gather(data):
 
     return packing_context
 
+
 def update_microbatch_calculator(
     samples_ratio_per_step: float,
     num_bins_this_rank: int,
@@ -1051,9 +1025,7 @@ def update_microbatch_calculator(
     )
     new_num_microbatches = get_num_microbatches()
 
-    log_single_rank(
-        logger, logging.INFO, "[Sequence Packing] Multi-step training plan:"
-    )
+    log_single_rank(logger, logging.INFO, "[Sequence Packing] Multi-step training plan:")
 
     log_single_rank(
         logger,
@@ -1095,10 +1067,14 @@ def update_microbatch_calculator(
     if opt_steps > 3:
         log_single_rank(logger, logging.INFO, f"  - ... ({opt_steps - 3} more steps)")
 
+
 def get_microbatch_dataloader(num_bins_this_rank, micro_batch_size):
     bin_indices = torch.arange(num_bins_this_rank)
     dataset = TensorDataset(bin_indices)
-    return DataLoader(dataset, batch_size=micro_batch_size, shuffle=False, collate_fn=lambda x: x[0])
+    return DataLoader(
+        dataset, batch_size=micro_batch_size, shuffle=False, collate_fn=lambda x: x[0]
+    )
+
 
 def get_sequence_packing_log_info(args):
     """Get logging information for sequence packing mode."""
diff --git a/megatron/training/activation_logging.py b/megatron/training/activation_logging.py
index 8f790a116c9..97f69e20b09 100644
--- a/megatron/training/activation_logging.py
+++ b/megatron/training/activation_logging.py
@@ -2,11 +2,11 @@
 
 """Forward activation logging using forward hooks."""
 
-from collections import defaultdict
 import json
 import logging
 import os
 import re
+from collections import defaultdict
 from typing import Callable, List, Tuple
 
 import torch
@@ -32,26 +32,34 @@ def _discover_te_types():
 
     try:
         from megatron.core.extensions.transformer_engine import (
+            TEColumnParallelLinear,
+            TELayerNormColumnParallelLinear,
             TELinear,
             TENorm,
-            TEColumnParallelLinear,
             TERowParallelLinear,
-            TELayerNormColumnParallelLinear,
         )
-        all_types.extend([TELinear, TENorm, TEColumnParallelLinear, TERowParallelLinear,
-                          TELayerNormColumnParallelLinear])
+
+        all_types.extend(
+            [
+                TELinear,
+                TENorm,
+                TEColumnParallelLinear,
+                TERowParallelLinear,
+                TELayerNormColumnParallelLinear,
+            ]
+        )
     except ImportError:
         pass
 
     try:
         from megatron.core.extensions.transformer_engine import (
-            TEGroupedLinear,
             TEColumnParallelGroupedLinear,
+            TEGroupedLinear,
             TERowParallelGroupedLinear,
         )
+
         if TEGroupedLinear is not None:
-            grouped = [TEGroupedLinear, TEColumnParallelGroupedLinear,
-                       TERowParallelGroupedLinear]
+            grouped = [TEGroupedLinear, TEColumnParallelGroupedLinear, TERowParallelGroupedLinear]
             all_types.extend(grouped)
             grouped_types.extend(grouped)
     except ImportError:
@@ -62,8 +70,14 @@ def _discover_te_types():
 
 _TE_TYPES, _GROUPED_LINEAR_TYPES = _discover_te_types()
 
-LINEAR_TYPES = (nn.Linear, nn.Embedding, ColumnParallelLinear, RowParallelLinear,
-                Router, *_TE_TYPES)
+LINEAR_TYPES = (
+    nn.Linear,
+    nn.Embedding,
+    ColumnParallelLinear,
+    RowParallelLinear,
+    Router,
+    *_TE_TYPES,
+)
 
 
 def _register_hooks(model, module_types, hook_factory, *, name_filter=None):
@@ -83,7 +97,9 @@ def _register_hooks(model, module_types, hook_factory, *, name_filter=None):
         model_chunk_name = f"model_chunk{model_chunk_id}"
         unwrapped = unwrap_model(model_chunk)
         for module_name, module in unwrapped.named_modules():
-            if isinstance(module, module_types) and (name_filter is None or name_filter(module_name)):
+            if isinstance(module, module_types) and (
+                name_filter is None or name_filter(module_name)
+            ):
                 hook_fn = hook_factory(model_chunk_name, module_name)
                 if hook_fn is None:
                     continue
@@ -91,6 +107,7 @@ def _register_hooks(model, module_types, hook_factory, *, name_filter=None):
                 handles.append(handle)
     return handles
 
+
 class ActivationLogger:
     """Captures and saves forward activations using forward hooks.
 
@@ -127,14 +144,18 @@ def hook(_, args, kwargs, output):
                 if inp is None:
                     continue
                 key = f"{module_name}/input{idx}"
-                sd[model_chunk_name][key] = inp.detach().cpu() if isinstance(inp, torch.Tensor) else inp
+                sd[model_chunk_name][key] = (
+                    inp.detach().cpu() if isinstance(inp, torch.Tensor) else inp
+                )
             for idx, out in enumerate(output if isinstance(output, tuple) else (output,)):
                 if out is not None and isinstance(out, torch.Tensor):
                     sd[model_chunk_name][f"{module_name}/output{idx}"] = out.detach().cpu()
             for kwarg_key, kwarg_value in kwargs.items():
                 key = f"{module_name}/{kwarg_key}"
                 sd[model_chunk_name][key] = (
-                    kwarg_value.detach().cpu() if isinstance(kwarg_value, torch.Tensor) else kwarg_value
+                    kwarg_value.detach().cpu()
+                    if isinstance(kwarg_value, torch.Tensor)
+                    else kwarg_value
                 )
 
         return hook
@@ -168,7 +189,8 @@ def _make_tpe_hook(self, _model_chunk_name: str, module_name: str) -> Callable:
         if not m:
             logger.warning(
                 "Cannot extract layer number from module name: %r — "
-                "skipping tokens-per-expert hook for this module", module_name
+                "skipping tokens-per-expert hook for this module",
+                module_name,
             )
             return None
         layer = m.group(1)
@@ -185,7 +207,9 @@ def hook(_, args, kwargs, output):
     def register_tpe_hooks(self, model):
         assert not self._tpe_hooks
         self._tpe_hooks = _register_hooks(
-            model, _GROUPED_LINEAR_TYPES, self._make_tpe_hook,
+            model,
+            _GROUPED_LINEAR_TYPES,
+            self._make_tpe_hook,
             name_filter=lambda name: name.endswith("linear_fc1"),
         )
 
@@ -209,14 +233,14 @@ def save_tpe(self, iteration: int):
         os.makedirs(tpe_dir, exist_ok=True)
         filepath = os.path.join(tpe_dir, f"rank{rank}.jsonl")
         lines = "".join(
-            json.dumps({"iter": iteration, "layer": int(layer),
-                        "tpe": microbatches}) + "\n"
+            json.dumps({"iter": iteration, "layer": int(layer), "tpe": microbatches}) + "\n"
             for layer, microbatches in sorted(self._tpe_records.items())
         )
         with open(filepath, "a") as f:
             f.write(lines)
         self._tpe_records.clear()
 
+
 _LOGGER: ActivationLogger | None = None
 
 
@@ -234,6 +258,7 @@ def _require_logger() -> ActivationLogger:
 
 # -- Full activation logging -------------------------------------------
 
+
 def enable_activation_logging(model: torch.nn.Module, save_dir: str):
     _get_logger(save_dir).register_activation_hooks(model)
 
@@ -248,6 +273,7 @@ def save_activations(iteration: int):
 
 # -- Tokens-per-expert logging ----------------------------------------
 
+
 def enable_tokens_per_expert_logging(model: torch.nn.Module, save_dir: str):
     _get_logger(save_dir).register_tpe_hooks(model)
 
diff --git a/megatron/training/arguments.py b/megatron/training/arguments.py
index ba4099c3b71..6a108a0d6d0 100644
--- a/megatron/training/arguments.py
+++ b/megatron/training/arguments.py
@@ -6,49 +6,48 @@
 import dataclasses
 import json
 import os
-from pathlib import Path
 import re
 import types
+from pathlib import Path
 
 import torch
 import torch.nn.functional as F
 from packaging.version import Version as PkgVersion
 
+from megatron.core.activations import squared_relu
 from megatron.core.dist_checkpointing.validation import StrictHandling
+from megatron.core.fusions.fused_bias_geglu import quick_gelu
+from megatron.core.msc_utils import MultiStorageClientFeature
+from megatron.core.quantization.utils import (
+    kitchen_quantization_recipe_config,
+    load_quantization_recipe,
+)
 from megatron.core.rerun_state_machine import RerunStateMachine
 from megatron.core.transformer import MLATransformerConfig, TransformerConfig
-from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout
 from megatron.core.transformer.enums import AttnBackend, CudaGraphScope
 from megatron.core.transformer.heterogeneous.heterogeneous_config import (
     HeterogeneousTransformerConfig,
     MLPConfig,
 )
+from megatron.core.transformer.pipeline_parallel_layer_layout import PipelineParallelLayerLayout
 from megatron.core.utils import (
     get_torch_version,
     is_flashinfer_min_version,
     is_te_min_version,
     is_torch_min_version,
 )
-from megatron.core.activations import squared_relu
-from megatron.core.fusions.fused_bias_geglu import quick_gelu
+from megatron.training.argument_utils import ArgumentGroupFactory
 from megatron.training.global_vars import set_global_variables
 from megatron.training.utils import (
     get_device_arch_version,
-    update_use_dist_ckpt,
     print_rank_0,
+    update_use_dist_ckpt,
     warn_rank_0,
 )
-from megatron.core.msc_utils import MultiStorageClientFeature
-
-from megatron.core.quantization.utils import (
-    kitchen_quantization_recipe_config,
-    load_quantization_recipe,
-)
 
-from megatron.training.argument_utils import ArgumentGroupFactory
 
 def add_megatron_arguments(parser: argparse.ArgumentParser):
-    """"Add Megatron-LM arguments to the given parser."""
+    """ "Add Megatron-LM arguments to the given parser."""
 
     # Standard arguments.
     parser = _add_network_size_args(parser)
@@ -86,13 +85,16 @@ def add_megatron_arguments(parser: argparse.ArgumentParser):
 
     return parser
 
+
 def parse_and_validate_args(extra_args_provider=None, ignore_unknown_args=False, args_defaults={}):
     args = parse_args(extra_args_provider, ignore_unknown_args)
 
     if args.use_checkpoint_args or args_defaults.get("use_checkpoint_args", False):
         from megatron.training.checkpointing import load_args_from_checkpoint
 
-        assert args.load is not None or args.pretrained_checkpoint is not None, "--use-checkpoint-args requires --load or --pretrained-checkpoint argument"
+        assert (
+            args.load is not None or args.pretrained_checkpoint is not None
+        ), "--use-checkpoint-args requires --load or --pretrained-checkpoint argument"
         assert args.non_persistent_ckpt_type != "local", (
             "--use-checkpoint-args is not supported with --non_persistent_ckpt_type=local. "
             "Two-stage checkpoint loading is not implemented, and all arguments must be defined "
@@ -117,8 +119,7 @@ def parse_and_validate_args(extra_args_provider=None, ignore_unknown_args=False,
 
 def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     """Parse all arguments."""
-    parser = argparse.ArgumentParser(description='Megatron-LM Arguments',
-                                     allow_abbrev=False)
+    parser = argparse.ArgumentParser(description='Megatron-LM Arguments', allow_abbrev=False)
 
     parser = add_megatron_arguments(parser)
 
@@ -137,10 +138,11 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     # Experimental yaml
     if args.yaml_cfg is not None:
         from .yaml_arguments import load_yaml
-        assert args.yaml_cfg and not args.use_legacy_models, \
-            "Yaml config is not supported with legacy models."
-        args = load_yaml(args.yaml_cfg)
 
+        assert (
+            args.yaml_cfg and not args.use_legacy_models
+        ), "Yaml config is not supported with legacy models."
+        args = load_yaml(args.yaml_cfg)
 
     # Args from environment
     args.rank = int(os.getenv('RANK', '0'))
@@ -184,17 +186,22 @@ def validate_model_config_args_from_heterogeneous_config(args):
             args.heterogeneous_layers_config_path
         ).read_text()
 
-    hf_config_dict = types.SimpleNamespace(**json.loads(args.heterogeneous_layers_config_encoded_json))
-
-    assert hf_config_dict.hidden_act == "silu", (
-        f"hidden_act in heterogeneous config is {hf_config_dict.hidden_act}, should be silu"
+    hf_config_dict = types.SimpleNamespace(
+        **json.loads(args.heterogeneous_layers_config_encoded_json)
     )
 
+    assert (
+        hf_config_dict.hidden_act == "silu"
+    ), f"hidden_act in heterogeneous config is {hf_config_dict.hidden_act}, should be silu"
+
     n_kv_heads_in_group = [
-        config["attention"]["n_heads_in_group"] for config in hf_config_dict.block_configs
+        config["attention"]["n_heads_in_group"]
+        for config in hf_config_dict.block_configs
         if config["attention"]["n_heads_in_group"] is not None
     ]
-    assert all(num == n_kv_heads_in_group[0] for num in n_kv_heads_in_group), "num query head must be consistent across all layers"
+    assert all(
+        num == n_kv_heads_in_group[0] for num in n_kv_heads_in_group
+    ), "num query head must be consistent across all layers"
 
     args_to_validate = {
         "swiglu": True,
@@ -220,16 +227,17 @@ def validate_model_config_args_from_heterogeneous_config(args):
             incompatible_args[key] = (provided_value, value)
 
     if incompatible_args:
-        incompatible_args_str = ', '.join([
-            f"{k}: {provided_value} (provided) != {value} (expected)"
-            for k, (provided_value, value) in incompatible_args.items()
-        ])
-        raise ValueError(
-            f"Arguments differ from heterogeneous config: {incompatible_args_str}"
+        incompatible_args_str = ', '.join(
+            [
+                f"{k}: {provided_value} (provided) != {value} (expected)"
+                for k, (provided_value, value) in incompatible_args.items()
+            ]
         )
+        raise ValueError(f"Arguments differ from heterogeneous config: {incompatible_args_str}")
+
 
 def _eval_pattern(pattern):
-    """ Validate and evaluate a string containing a Python list expression """
+    """Validate and evaluate a string containing a Python list expression"""
     assert isinstance(pattern, str)
 
     # validate input, only allow comma, digits, [, ], (, ), +, and *
@@ -238,8 +246,9 @@ def _eval_pattern(pattern):
 
     return eval(pattern)
 
+
 def no_rope_freq_type(x):
-    """ Controls which layers to skip performing Rotary Position Embedding.
+    """Controls which layers to skip performing Rotary Position Embedding.
     - An integer N: Represents a 1:N ratio, meaning RoPE is skipped every N-1 layers.
     - A string "N": Same as above, but provided as a string
     - A string containing a Python list expression that defines a custom pattern, e.g.:
@@ -261,6 +270,22 @@ def no_rope_freq_type(x):
         # it's a single int but in str
         return int(x)
 
+
+def compress_ratios_type(x):
+    """Per-layer compress ratios for compressed sparse attention.
+
+    Accepts a string containing a Python list expression, e.g.:
+      "[0,0,4,128,4,128]"
+      "([0]+[4,128]*2)*3"
+    The result must be a list of integers. Each value represents the
+    compression ratio for the corresponding transformer layer.
+    """
+    if isinstance(x, list):
+        return x
+    assert isinstance(x, str)
+    return _eval_pattern(x)
+
+
 def moe_freq_type(x):
     """Frequency between MoE layers and Dense layers.
 
@@ -286,6 +311,7 @@ def moe_freq_type(x):
         # it's a single int but in str
         return int(x)
 
+
 def la_freq_type(x):
     """Frequency between LA (linear attention) layers and SDPA (scaled dot-product attention) layers.
 
@@ -311,6 +337,7 @@ def la_freq_type(x):
         # it's a single int but in str
         return int(x)
 
+
 def tuple_type(x):
     """
     Convert a string to a tuple of integers.
@@ -323,6 +350,7 @@ def tuple_type(x):
     assert isinstance(x, str)
     return tuple(int(i) for i in x.strip('()').split(','))
 
+
 def validate_args(args, defaults={}):
 
     # Prep for checkpoint conversion.
@@ -332,12 +360,16 @@ def validate_args(args, defaults={}):
         args.exit_on_missing_checkpoint = True
 
     # Temporary
-    assert args.non_persistent_ckpt_type in ['global', 'local', None], \
-        'Currently only global and local checkpoints are supported'
+    assert args.non_persistent_ckpt_type in [
+        'global',
+        'local',
+        None,
+    ], 'Currently only global and local checkpoints are supported'
     if args.non_persistent_ckpt_type == 'local':
         try:
-            from nvidia_resiliency_ext.checkpointing.local.ckpt_managers.local_manager import \
-                LocalCheckpointManager
+            from nvidia_resiliency_ext.checkpointing.local.ckpt_managers.local_manager import (
+                LocalCheckpointManager,
+            )
         except ModuleNotFoundError as e:
             raise RuntimeError('nvidia_resiliency_ext is required for local checkpointing') from e
 
@@ -346,24 +378,35 @@ def validate_args(args, defaults={}):
 
     # Set args.use_dist_ckpt from args.ckpt_format.
     if args.use_legacy_models:
-        assert args.ckpt_format == "torch", \
-            "legacy model format only supports the 'torch' checkpoint format."
+        assert (
+            args.ckpt_format == "torch"
+        ), "legacy model format only supports the 'torch' checkpoint format."
     update_use_dist_ckpt(args)
 
-    total_model_size = args.tensor_model_parallel_size * args.pipeline_model_parallel_size * args.context_parallel_size
+    total_model_size = (
+        args.tensor_model_parallel_size
+        * args.pipeline_model_parallel_size
+        * args.context_parallel_size
+    )
 
     # Total model size.
-    assert args.world_size % total_model_size == 0, (
-        f"world size ({args.world_size}) is not divisible by total_model_size ({total_model_size=})"
-    )
+    assert (
+        args.world_size % total_model_size == 0
+    ), f"world size ({args.world_size}) is not divisible by total_model_size ({total_model_size=})"
 
     if args.attention_backend == AttnBackend.local:
-        assert args.spec[0] == 'local' , '--attention-backend local is only supported with --spec local'
+        assert (
+            args.spec[0] == 'local'
+        ), '--attention-backend local is only supported with --spec local'
 
     # Pipeline model parallel size.
     args.transformer_pipeline_model_parallel_size = args.pipeline_model_parallel_size
 
-    total_model_size = args.tensor_model_parallel_size * args.pipeline_model_parallel_size * args.context_parallel_size
+    total_model_size = (
+        args.tensor_model_parallel_size
+        * args.pipeline_model_parallel_size
+        * args.context_parallel_size
+    )
     args.data_parallel_size = args.world_size // total_model_size
 
     if args.perform_rl_step:
@@ -397,13 +440,13 @@ def validate_args(args, defaults={}):
         # ----------------------------------------------------------------
 
         # Persisting CGs only makes sense if we build any CGs.
-        assert not args.rl_persist_cuda_graphs or args.cuda_graph_impl != "none", (
-            "--rl-persist-cuda-graphs is set but no CUDA graphs are being built."
-        )
+        assert (
+            not args.rl_persist_cuda_graphs or args.cuda_graph_impl != "none"
+        ), "--rl-persist-cuda-graphs is set but no CUDA graphs are being built."
         # Training CGs only makes sense if we build any CGs.
-        assert not args.rl_training_cuda_graphs or args.cuda_graph_impl != "none", (
-            "--rl-training-cuda-graphs is set but no CUDA graphs are being built."
-        )
+        assert (
+            not args.rl_training_cuda_graphs or args.cuda_graph_impl != "none"
+        ), "--rl-training-cuda-graphs is set but no CUDA graphs are being built."
         # If CUDA graphs persist and KV cache memory address is not static, we need
         # either UVM or torch_memory_saver to maintain memory address stability for CGs.
         if args.rl_persist_cuda_graphs and args.rl_kv_cache_management_mode != "persist":
@@ -417,8 +460,8 @@ def validate_args(args, defaults={}):
 
         # Offload mode requires CG persistence: CG recapture runs dummy forward
         # passes that corrupt the preserved KV data.
-        assert (
-            (not args.rl_kv_cache_management_mode == "offload") or (args.rl_persist_cuda_graphs)
+        assert (not args.rl_kv_cache_management_mode == "offload") or (
+            args.rl_persist_cuda_graphs
         ), "--rl-kv-cache-management-mode=offload requires --rl-persist-cuda-graphs"
 
         # There's no need to manually offload the KV cache with UVM.
@@ -427,7 +470,7 @@ def validate_args(args, defaults={}):
             and args.rl_kv_cache_management_mode == "offload"
         ), "--rl-kv-cache-management-mode=offload is incompatible with UVM"
         # We currently cannot recapture CGs in offload mode.
-        assert not(
+        assert not (
             not args.rl_persist_cuda_graphs and args.rl_kv_cache_management_mode == "offload"
         ), "Cannot recapture CUDA graphs while offloading KV cache."
 
@@ -445,91 +488,115 @@ def validate_args(args, defaults={}):
                     )
 
         # Resolve deprecated --rl-parallel-generation-tasks -> --rl-num-parallel-generations.
-        assert args.rl_num_parallel_generations is None \
-            or args.rl_parallel_generation_tasks is None, \
-            "Cannot specify both --rl-num-parallel-generations and " \
-            "--rl-parallel-generation-tasks. Use --rl-num-parallel-generations " \
+        assert (
+            args.rl_num_parallel_generations is None or args.rl_parallel_generation_tasks is None
+        ), (
+            "Cannot specify both --rl-num-parallel-generations and "
+            "--rl-parallel-generation-tasks. Use --rl-num-parallel-generations "
             "(--rl-parallel-generation-tasks is deprecated)."
+        )
         if args.rl_parallel_generation_tasks is not None:
             print_rank_0(
                 "WARNING: --rl-parallel-generation-tasks is deprecated, "
-                "use --rl-num-parallel-generations instead.")
+                "use --rl-num-parallel-generations instead."
+            )
             args.rl_num_parallel_generations = (
-                args.rl_parallel_generation_tasks * args.grpo_group_size)
+                args.rl_parallel_generation_tasks * args.grpo_group_size
+            )
 
         # Resolve --rl-num-parallel-generations / --rl-num-parallel-generation-batches.
-        assert args.rl_num_parallel_generations is None \
-            or args.rl_num_parallel_generation_batches is None, \
-            "--rl-num-parallel-generations and --rl-num-parallel-generation-batches " \
+        assert (
+            args.rl_num_parallel_generations is None
+            or args.rl_num_parallel_generation_batches is None
+        ), (
+            "--rl-num-parallel-generations and --rl-num-parallel-generation-batches "
             "are mutually exclusive."
+        )
         if args.rl_num_parallel_generations is not None:
-            assert args.rl_partial_rollouts, \
-                "--rl-num-parallel-generations requires --rl-partial-rollouts."
-            assert args.rl_num_parallel_generations % args.grpo_group_size == 0, \
-                f"--rl-num-parallel-generations ({args.rl_num_parallel_generations}) " \
+            assert (
+                args.rl_partial_rollouts
+            ), "--rl-num-parallel-generations requires --rl-partial-rollouts."
+            assert args.rl_num_parallel_generations % args.grpo_group_size == 0, (
+                f"--rl-num-parallel-generations ({args.rl_num_parallel_generations}) "
                 f"must be divisible by --grpo-group-size ({args.grpo_group_size})."
+            )
             args.rl_parallel_generation_tasks = (
-                args.rl_num_parallel_generations // args.grpo_group_size)
+                args.rl_num_parallel_generations // args.grpo_group_size
+            )
             if args.rl_generation_batch_size is None:
                 args.rl_generation_batch_size = 1
         elif args.rl_num_parallel_generation_batches is not None:
-            assert args.rl_partial_rollouts, \
-                "--rl-num-parallel-generation-batches requires --rl-partial-rollouts."
+            assert (
+                args.rl_partial_rollouts
+            ), "--rl-num-parallel-generation-batches requires --rl-partial-rollouts."
             if args.rl_generation_batch_size is None:
                 args.rl_generation_batch_size = args.grpo_prompts_per_step
             args.rl_parallel_generation_tasks = (
-                args.rl_num_parallel_generation_batches * args.rl_generation_batch_size)
+                args.rl_num_parallel_generation_batches * args.rl_generation_batch_size
+            )
         else:
             if args.rl_generation_batch_size is None:
                 args.rl_generation_batch_size = 1
             args.rl_parallel_generation_tasks = 512
 
         # Derive enforce_order after all resolution is complete.
-        args.rl_enforce_generation_order = (args.rl_generation_batch_size > 1)
+        args.rl_enforce_generation_order = args.rl_generation_batch_size > 1
 
         args.grpo_samples_per_iteration = args.grpo_prompts_per_step * args.grpo_group_size
 
         if args.rl_use_sequence_packing:
-            assert args.micro_batch_size == 1, \
-                "micro_batch_size must be 1 when using sequence packing. To increase compute per micro batch increase the sequence length."
-
-    print_rank_0('using world size: {}, data-parallel size: {}, '
-                 'context-parallel size: {}, '
-                 'hierarchical context-parallel sizes: {}, '
-                 'tensor-model-parallel size: {}, '
-                 'pipeline-model-parallel size: {}'.format(
-                     args.world_size, args.data_parallel_size,
-                     args.context_parallel_size,
-                     args.hierarchical_context_parallel_sizes,
-                     args.tensor_model_parallel_size,
-                     args.pipeline_model_parallel_size))
+            assert (
+                args.micro_batch_size == 1
+            ), "micro_batch_size must be 1 when using sequence packing. To increase compute per micro batch increase the sequence length."
+
+    print_rank_0(
+        'using world size: {}, data-parallel size: {}, '
+        'context-parallel size: {}, '
+        'hierarchical context-parallel sizes: {}, '
+        'tensor-model-parallel size: {}, '
+        'pipeline-model-parallel size: {}'.format(
+            args.world_size,
+            args.data_parallel_size,
+            args.context_parallel_size,
+            args.hierarchical_context_parallel_sizes,
+            args.tensor_model_parallel_size,
+            args.pipeline_model_parallel_size,
+        )
+    )
 
     # Checks.
 
     if args.hierarchical_context_parallel_sizes:
         from numpy import prod
+
         assert args.context_parallel_size == prod(args.hierarchical_context_parallel_sizes)
     if "a2a+p2p" in args.cp_comm_type:
-        assert args.hierarchical_context_parallel_sizes is not None, \
-        "--hierarchical-context-parallel-sizes must be set when a2a+p2p is used in cp comm"
+        assert (
+            args.hierarchical_context_parallel_sizes is not None
+        ), "--hierarchical-context-parallel-sizes must be set when a2a+p2p is used in cp comm"
 
     if args.expert_tensor_parallel_size is None:
         args.expert_tensor_parallel_size = args.tensor_model_parallel_size
 
     # Deprecated arguments.
-    assert args.batch_size is None, '--batch-size argument is no longer ' \
-        'valid, use --micro-batch-size instead'
+    assert args.batch_size is None, (
+        '--batch-size argument is no longer ' 'valid, use --micro-batch-size instead'
+    )
     del args.batch_size
-    assert args.warmup is None, '--warmup argument is no longer valid, use ' \
-        '--lr-warmup-fraction instead'
+    assert args.warmup is None, (
+        '--warmup argument is no longer valid, use ' '--lr-warmup-fraction instead'
+    )
     del args.warmup
-    assert args.model_parallel_size is None, '--model-parallel-size is no ' \
-        'longer valid, use --tensor-model-parallel-size instead'
+    assert args.model_parallel_size is None, (
+        '--model-parallel-size is no ' 'longer valid, use --tensor-model-parallel-size instead'
+    )
     del args.model_parallel_size
 
     if args.checkpoint_activations:
-        print_rank_0('--checkpoint-activations is no longer valid, use --recompute-activations, '
-                     'or, for more control, --recompute-granularity and --recompute-method.')
+        print_rank_0(
+            '--checkpoint-activations is no longer valid, use --recompute-activations, '
+            'or, for more control, --recompute-granularity and --recompute-method.'
+        )
         exit()
     del args.checkpoint_activations
 
@@ -565,31 +632,42 @@ def validate_args(args, defaults={}):
         # arguments that are passed to the program. We check this by
         # ensuring the arg is set to None.
         if getattr(args, key, None) is not None:
-            warn_rank_0('Overriding default arguments for {key}:{v} '
-                        'with {key}:{v2}'.format(key=key, v=defaults[key],
-                                                 v2=getattr(args, key)))
+            warn_rank_0(
+                'Overriding default arguments for {key}:{v} '
+                'with {key}:{v2}'.format(key=key, v=defaults[key], v2=getattr(args, key))
+            )
         else:
             setattr(args, key, defaults[key])
 
     if args.data_path is not None and args.split is None:
         legacy_default_split_value = '969, 30, 1'
-        warn_rank_0('Please specify --split when using --data-path. Using legacy default value '
-                    f'of "{legacy_default_split_value}"')
+        warn_rank_0(
+            'Please specify --split when using --data-path. Using legacy default value '
+            f'of "{legacy_default_split_value}"'
+        )
         args.split = legacy_default_split_value
 
     use_data_path = (args.data_path is not None) or (args.data_args_path is not None)
     if use_data_path:
         # Exactly one of the two has to be None if we use it.
         assert (args.data_path is None) or (args.data_args_path is None)
-    use_per_split_data_path = any(
-        elt is not None
-        for elt in [args.train_data_path, args.valid_data_path, args.test_data_path]) or \
-            args.per_split_data_args_path is not None
+    use_per_split_data_path = (
+        any(
+            elt is not None
+            for elt in [args.train_data_path, args.valid_data_path, args.test_data_path]
+        )
+        or args.per_split_data_args_path is not None
+    )
     if use_per_split_data_path:
-         # Exactly one of the two has to be None if we use it.
-        assert any(elt is not None
-                   for elt in [args.train_data_path, args.valid_data_path, args.test_data_path]) is False or \
-            args.per_split_data_args_path is None
+        # Exactly one of the two has to be None if we use it.
+        assert (
+            any(
+                elt is not None
+                for elt in [args.train_data_path, args.valid_data_path, args.test_data_path]
+            )
+            is False
+            or args.per_split_data_args_path is None
+        )
 
     if args.phase_transition_iterations:
         args.phase_transition_iterations = sorted(
@@ -602,9 +680,7 @@ def validate_args(args, defaults={}):
         args, '_is_global_batch_size_explicitly_specified', args.global_batch_size is not None
     )
     if args.step_batch_size_schedule is not None and is_global_batch_size_explicitly_specified:
-        raise ValueError(
-            'Cannot specify both --step-batch-size-schedule and --global-batch-size'
-        )
+        raise ValueError('Cannot specify both --step-batch-size-schedule and --global-batch-size')
     if args.global_batch_size is None:
         args.global_batch_size = args.micro_batch_size * args.data_parallel_size
         print_rank_0('setting global batch size to {}'.format(args.global_batch_size))
@@ -615,28 +691,36 @@ def validate_args(args, defaults={}):
         args.eval_global_batch_size = args.global_batch_size
     if args.eval_micro_batch_size is None:
         args.eval_micro_batch_size = args.micro_batch_size
-    assert args.eval_global_batch_size % (args.eval_micro_batch_size * args.data_parallel_size) == 0, \
-        f"eval_global_batch_size ({args.eval_global_batch_size}) must be divisible by " \
+    assert (
+        args.eval_global_batch_size % (args.eval_micro_batch_size * args.data_parallel_size) == 0
+    ), (
+        f"eval_global_batch_size ({args.eval_global_batch_size}) must be divisible by "
         f"eval_micro_batch_size ({args.eval_micro_batch_size}) * data_parallel_size ({args.data_parallel_size})"
+    )
 
     if args.perform_rl_step:
         num_generated_samples_per_inference_iteration = (
-            args.grpo_samples_per_iteration * args.grpo_iterations)
+            args.grpo_samples_per_iteration * args.grpo_iterations
+        )
 
         # Ensure that the number of prompts we collect is a multiple of the global batch size.
-        assert num_generated_samples_per_inference_iteration % args.global_batch_size == 0, \
-            f"grpo_group_size * grpo_prompts_per_step * grpo_iterations should be divisible by global_batch_size"
+        assert (
+            num_generated_samples_per_inference_iteration % args.global_batch_size == 0
+        ), f"grpo_group_size * grpo_prompts_per_step * grpo_iterations should be divisible by global_batch_size"
 
         # For now only exit/checkpoint on iterations where we generate data. We don't currently
         # have a way to checkpoint the generated data.
         num_training_iterations_per_inference_iteration = (
-            num_generated_samples_per_inference_iteration // args.global_batch_size)
+            num_generated_samples_per_inference_iteration // args.global_batch_size
+        )
         if args.exit_interval is not None:
-            assert args.exit_interval % num_training_iterations_per_inference_iteration == 0, \
-                f"exit_interval should be divisible by number of global batches per inference iteration."
+            assert (
+                args.exit_interval % num_training_iterations_per_inference_iteration == 0
+            ), f"exit_interval should be divisible by number of global batches per inference iteration."
         if args.save_interval is not None:
-            assert args.save_interval % num_training_iterations_per_inference_iteration == 0, \
-                f"save_interval should be divisible by number of global batches per inference iteration."
+            assert (
+                args.save_interval % num_training_iterations_per_inference_iteration == 0
+            ), f"save_interval should be divisible by number of global batches per inference iteration."
 
     # === Hybrid layer pattern: deprecation handling and validation ===
 
@@ -664,9 +748,12 @@ def validate_args(args, defaults={}):
         )
 
     from megatron.core.ssm.mamba_hybrid_layer_allocation import (
-        Symbols, parse_hybrid_pattern, get_hybrid_total_layer_count,
+        Symbols,
+        get_hybrid_total_layer_count,
         get_hybrid_total_pipeline_segment_count,
+        parse_hybrid_pattern,
     )
+
     sep = Symbols.MTP_SEPARATOR
 
     # Backward compat: convert legacy mtp_hybrid_override_pattern to unified format
@@ -678,7 +765,9 @@ def validate_args(args, defaults={}):
     ):
         main_pattern = args.hybrid_layer_pattern or ''
         mtp_pattern = args.mtp_hybrid_override_pattern
-        args.hybrid_layer_pattern = main_pattern + sep + sep.join([mtp_pattern] * args.mtp_num_layers)
+        args.hybrid_layer_pattern = (
+            main_pattern + sep + sep.join([mtp_pattern] * args.mtp_num_layers)
+        )
         args.mtp_hybrid_override_pattern = None
         print_rank_0(f"Converted legacy MTP pattern to unified: {args.hybrid_layer_pattern}")
 
@@ -772,13 +861,15 @@ def validate_args(args, defaults={}):
                     f"MTP depth count ({inferred_mtp_num_layers}) in pattern "
                     f"'{args.hybrid_layer_pattern}'. "
                     f"Using the inferred value ({inferred_mtp_num_layers}).",
-                    args.rank
+                    args.rank,
                 )
                 args.mtp_num_layers = inferred_mtp_num_layers
 
     # MTP validation
     if args.mtp_num_layers:
-        assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)."
+        assert (
+            not args.use_legacy_models
+        ), "The legacy Megatron models does not support Multi-Token Prediction (MTP)."
         assert args.position_embedding_type == "rope" or args.position_embedding_type == "none", (
             f"Multi-Token Prediction (MTP) is not supported with {args.position_embedding_type} position embedding type."
             + f"The supported position embedding types are rope and none."
@@ -787,14 +878,16 @@ def validate_args(args, defaults={}):
     # Validate MTP args for hybrid vs non-hybrid models
     if args.hybrid_layer_pattern is not None:
         # Mamba/hybrid model MTP validation
-        if args.mtp_num_layers and not (args.hybrid_layer_pattern and sep in args.hybrid_layer_pattern):
+        if args.mtp_num_layers and not (
+            args.hybrid_layer_pattern and sep in args.hybrid_layer_pattern
+        ):
             # Hybrid model wants MTP but no unified pattern - check for legacy args
             if args.mtp_hybrid_override_pattern is None:
                 warn_rank_0(
                     "Hybrid model with --mtp-num-layers but no MTP pattern. "
                     "Use unified --hybrid-layer-pattern with '/' separator (e.g., 'M*M*/MM/MM') "
                     "or legacy --mtp-hybrid-override-pattern for old checkpoints.",
-                    args.rank
+                    args.rank,
                 )
     else:
         # Non-hybrid (GPT) model MTP validation
@@ -803,7 +896,7 @@ def validate_args(args, defaults={}):
                 "--mtp-hybrid-override-pattern is for Mamba/hybrid models only. "
                 "For GPT models, MTP replicates the main transformer layer structure. "
                 "This argument will be ignored.",
-                args.rank
+                args.rank,
             )
 
     # Infer use of MLA from unified pattern
@@ -829,7 +922,9 @@ def validate_args(args, defaults={}):
     if args.pipeline_model_parallel_layout is not None:
         # Parse the input flattened layout to a list and get the vpp size.
         # We will validate the layout more carefully in the TransformerConfig constructor.
-        num_stages = PipelineParallelLayerLayout.get_num_stages_from_str(args.pipeline_model_parallel_layout)
+        num_stages = PipelineParallelLayerLayout.get_num_stages_from_str(
+            args.pipeline_model_parallel_layout
+        )
         assert num_stages % args.pipeline_model_parallel_size == 0, (
             f"The length of pipeline_model_parallel_layout must be divisible"
             f" by pipeline_model_parallel_size ({num_stages=},"
@@ -838,10 +933,15 @@ def validate_args(args, defaults={}):
         args.virtual_pipeline_model_parallel_size = num_stages // args.pipeline_model_parallel_size
         if args.virtual_pipeline_model_parallel_size == 1:
             args.virtual_pipeline_model_parallel_size = None
-    elif args.num_layers_per_virtual_pipeline_stage is not None or args.num_virtual_stages_per_pipeline_rank is not None:
+    elif (
+        args.num_layers_per_virtual_pipeline_stage is not None
+        or args.num_virtual_stages_per_pipeline_rank is not None
+    ):
         if args.num_virtual_stages_per_pipeline_rank is None:
-            assert args.decoder_first_pipeline_num_layers is None and args.decoder_last_pipeline_num_layers is None, \
-                'please use --num-virtual-stages-per-pipeline-rank to specify virtual pipeline parallel degree when enable uneven pipeline parallelism'
+            assert (
+                args.decoder_first_pipeline_num_layers is None
+                and args.decoder_last_pipeline_num_layers is None
+            ), 'please use --num-virtual-stages-per-pipeline-rank to specify virtual pipeline parallel degree when enable uneven pipeline parallelism'
             if args.num_layers is not None:
                 num_layers = args.num_layers
             else:
@@ -853,14 +953,19 @@ def validate_args(args, defaults={}):
             if args.account_for_loss_in_pipeline_split:
                 num_layers += 1
 
-            assert num_layers % args.transformer_pipeline_model_parallel_size == 0, \
-                'number of layers of the model must be divisible pipeline model parallel size'
-            num_layers_per_pipeline_stage = num_layers // args.transformer_pipeline_model_parallel_size
+            assert (
+                num_layers % args.transformer_pipeline_model_parallel_size == 0
+            ), 'number of layers of the model must be divisible pipeline model parallel size'
+            num_layers_per_pipeline_stage = (
+                num_layers // args.transformer_pipeline_model_parallel_size
+            )
 
-            assert num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0, \
-                'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage'
-            args.virtual_pipeline_model_parallel_size = num_layers_per_pipeline_stage // \
-                args.num_layers_per_virtual_pipeline_stage
+            assert (
+                num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0
+            ), 'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage'
+            args.virtual_pipeline_model_parallel_size = (
+                num_layers_per_pipeline_stage // args.num_layers_per_virtual_pipeline_stage
+            )
         else:
             args.virtual_pipeline_model_parallel_size = args.num_virtual_stages_per_pipeline_rank
         if args.virtual_pipeline_model_parallel_size == 1:
@@ -870,7 +975,10 @@ def validate_args(args, defaults={}):
         if args.hybrid_layer_pattern is None:
             args.virtual_pipeline_model_parallel_size = None
 
-        if args.decoder_first_pipeline_num_layers is None and args.decoder_last_pipeline_num_layers is None:
+        if (
+            args.decoder_first_pipeline_num_layers is None
+            and args.decoder_last_pipeline_num_layers is None
+        ):
             # Divisibility check not applicable for T5 models which specify encoder_num_layers
             # and decoder_num_layers, or for hybrid models using --hybrid-layer-pattern.
             if args.num_layers is not None and args.hybrid_layer_pattern is None:
@@ -882,61 +990,74 @@ def validate_args(args, defaults={}):
                 if args.account_for_loss_in_pipeline_split:
                     num_layers += 1
 
-                assert num_layers % args.transformer_pipeline_model_parallel_size == 0, \
-                    'Number of layers should be divisible by the pipeline-model-parallel size'
+                assert (
+                    num_layers % args.transformer_pipeline_model_parallel_size == 0
+                ), 'Number of layers should be divisible by the pipeline-model-parallel size'
 
     if args.virtual_pipeline_model_parallel_size is not None:
         if args.overlap_p2p_comm:
-            assert args.pipeline_model_parallel_size > 1, \
-                'When interleaved schedule is used, pipeline-model-parallel size '\
+            assert args.pipeline_model_parallel_size > 1, (
+                'When interleaved schedule is used, pipeline-model-parallel size '
                 'should be greater than 1'
+            )
         else:
-            assert args.pipeline_model_parallel_size > 2, \
-                'When interleaved schedule is used and p2p communication overlap is disabled, '\
-                'pipeline-model-parallel size should be greater than 2 to avoid having multiple '\
+            assert args.pipeline_model_parallel_size > 2, (
+                'When interleaved schedule is used and p2p communication overlap is disabled, '
+                'pipeline-model-parallel size should be greater than 2 to avoid having multiple '
                 'p2p sends and recvs between same 2 ranks per communication batch'
+            )
     else:
         # Overlap P2P communication is disabled if not using the interleaved schedule.
         args.overlap_p2p_comm = False
         args.align_param_gather = False
         # Only print warning if PP size > 1.
         if args.rank == 0 and args.pipeline_model_parallel_size > 1:
-            print('WARNING: Setting args.overlap_p2p_comm and args.align_param_gather to False '
+            print(
+                'WARNING: Setting args.overlap_p2p_comm and args.align_param_gather to False '
                 'since non-interleaved schedule does not support overlapping p2p communication '
-                'and aligned param AG')
+                'and aligned param AG'
+            )
 
     print_rank_0(
         f"Number of virtual stages per pipeline stage: {args.virtual_pipeline_model_parallel_size}"
     )
 
     if args.overlap_param_gather:
-        assert args.use_distributed_optimizer or args.use_megatron_fsdp \
-            or args.optimizer == 'dist_muon', \
-            '--overlap-param-gather only supported with distributed optimizer, megatron fsdp, or dist_muon'
-        assert args.overlap_grad_reduce, \
-            'Must use --overlap-param-gather with --overlap-grad-reduce'
-        assert not args.use_legacy_models, \
-            '--overlap-param-gather only supported with MCore models'
+        assert (
+            args.use_distributed_optimizer
+            or args.use_megatron_fsdp
+            or args.optimizer == 'dist_muon'
+        ), '--overlap-param-gather only supported with distributed optimizer, megatron fsdp, or dist_muon'
+        assert (
+            args.overlap_grad_reduce
+        ), 'Must use --overlap-param-gather with --overlap-grad-reduce'
+        assert not args.use_legacy_models, '--overlap-param-gather only supported with MCore models'
 
     if args.use_torch_fsdp2:
-        assert is_torch_min_version("2.4.0"), \
-            'FSDP2 requires PyTorch >= 2.4.0 with FSDP 2 support.'
-        assert args.pipeline_model_parallel_size == 1, \
-            '--use-torch-fsdp2 is not supported with pipeline parallelism'
-        assert args.expert_model_parallel_size == 1, \
-            '--use-torch-fsdp2 is not supported with expert parallelism'
-        assert not args.use_distributed_optimizer, \
-            "--use-torch-fsdp2 is not supported with MCore's distributed optimizer"
-        assert not args.gradient_accumulation_fusion, \
-            '--use-torch-fsdp2 is not supported with gradient accumulation fusion'
-        assert args.ckpt_format in ('torch_dist', 'torch_dcp'), \
-            '--use-torch-fsdp2 requires --ckpt-format torch_dist or torch_dcp'
-        assert args.untie_embeddings_and_output_weights, \
-            '--use-torch-fsdp2 requires --untie-embeddings-and-output-weights'
-        assert not args.fp16, \
-            '--use-torch-fsdp2 not supported with fp16 yet'
-        assert os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1", \
-            'FSDP always requires CUDA_DEVICE_MAX_CONNECTIONS value large than one'
+        assert is_torch_min_version("2.4.0"), 'FSDP2 requires PyTorch >= 2.4.0 with FSDP 2 support.'
+        assert (
+            args.pipeline_model_parallel_size == 1
+        ), '--use-torch-fsdp2 is not supported with pipeline parallelism'
+        assert (
+            args.expert_model_parallel_size == 1
+        ), '--use-torch-fsdp2 is not supported with expert parallelism'
+        assert (
+            not args.use_distributed_optimizer
+        ), "--use-torch-fsdp2 is not supported with MCore's distributed optimizer"
+        assert (
+            not args.gradient_accumulation_fusion
+        ), '--use-torch-fsdp2 is not supported with gradient accumulation fusion'
+        assert args.ckpt_format in (
+            'torch_dist',
+            'torch_dcp',
+        ), '--use-torch-fsdp2 requires --ckpt-format torch_dist or torch_dcp'
+        assert (
+            args.untie_embeddings_and_output_weights
+        ), '--use-torch-fsdp2 requires --untie-embeddings-and-output-weights'
+        assert not args.fp16, '--use-torch-fsdp2 not supported with fp16 yet'
+        assert (
+            os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1"
+        ), 'FSDP always requires CUDA_DEVICE_MAX_CONNECTIONS value large than one'
 
         if args.fp8_param_gather and is_te_min_version("2.0.0"):
             args.fp8_param_gather = False
@@ -954,18 +1075,27 @@ def validate_args(args, defaults={}):
             )
 
     if args.overlap_param_gather_with_optimizer_step:
-        assert args.use_distributed_optimizer, \
-            '--overlap-param-gather-with-optimizer-step only supported with distributed optimizer'
-        assert args.overlap_param_gather, \
-            'Must use --overlap-param-gather-with-optimizer-step with --overlap-param-gather'
-        assert args.virtual_pipeline_model_parallel_size is not None, \
-            '--overlap-param-gather-with-optimizer-step only supported with interleaved pipeline parallelism'
-        assert not args.use_dist_ckpt, \
-            '--overlap-param-gather-with-optimizer-step not supported with distributed checkpointing yet'
+        assert (
+            args.use_distributed_optimizer
+        ), '--overlap-param-gather-with-optimizer-step only supported with distributed optimizer'
+        assert (
+            args.overlap_param_gather
+        ), 'Must use --overlap-param-gather-with-optimizer-step with --overlap-param-gather'
+        assert (
+            args.virtual_pipeline_model_parallel_size is not None
+        ), '--overlap-param-gather-with-optimizer-step only supported with interleaved pipeline parallelism'
+        assert (
+            not args.use_dist_ckpt
+        ), '--overlap-param-gather-with-optimizer-step not supported with distributed checkpointing yet'
 
     # Map string data-type to torch.dtype.
     dtype_map = {
-        'fp32': torch.float32, 'bf16': torch.bfloat16, 'fp16': torch.float16, 'fp8': torch.uint8, 'auto': None, None: None,
+        'fp32': torch.float32,
+        'bf16': torch.bfloat16,
+        'fp16': torch.float16,
+        'fp8': torch.uint8,
+        'auto': None,
+        None: None,
     }
     map_dtype = lambda d: d if isinstance(d, torch.dtype) else dtype_map[d]
 
@@ -983,12 +1113,18 @@ def validate_args(args, defaults={}):
         args.megatron_fsdp_grad_comm_dtype = torch.bfloat16
 
     if args.fp8_param_gather:
-        assert args.use_distributed_optimizer or args.use_torch_fsdp2 or args.use_megatron_fsdp or not torch.is_grad_enabled(), \
-            '--fp8-param-gather only supported with distributed optimizer, torch fsdp2, megatron fsdp, or inference mode'
+        assert (
+            args.use_distributed_optimizer
+            or args.use_torch_fsdp2
+            or args.use_megatron_fsdp
+            or not torch.is_grad_enabled()
+        ), '--fp8-param-gather only supported with distributed optimizer, torch fsdp2, megatron fsdp, or inference mode'
 
     # FP4 and FP8 are mutually exclusive
     if args.fp4 and args.fp8:
-        raise ValueError("--fp4-format and --fp8-format cannot be used simultaneously. Please choose one.")
+        raise ValueError(
+            "--fp4-format and --fp8-format cannot be used simultaneously. Please choose one."
+        )
 
     # FP4 param requires FP4 mode
     if args.fp4_param_gather and not args.fp4:
@@ -996,7 +1132,9 @@ def validate_args(args, defaults={}):
 
     # FP4 requires TE >= 2.7.0.dev0
     if args.fp4 and not is_te_min_version("2.7.0.dev0"):
-        raise ValueError("--fp4-format requires Transformer Engine >= 2.7.0.dev0 for NVFP4BlockScaling support.")
+        raise ValueError(
+            "--fp4-format requires Transformer Engine >= 2.7.0.dev0 for NVFP4BlockScaling support."
+        )
 
     if (
         args.fp8_recipe == 'mxfp8'
@@ -1021,8 +1159,10 @@ def validate_args(args, defaults={}):
         # Optimizer step MXFP8 buffer operation that is not relevant or supported for Megatron-FSDP.
         args.reuse_grad_buf_for_mxfp8_param_ag = False
         # Optimizer compatibility check.
-        assert args.optimizer in ('sgd', 'adam'), \
-            f"Megatron-FSDP does not support the {args.optimizer} optimizer yet."
+        assert args.optimizer in (
+            'sgd',
+            'adam',
+        ), f"Megatron-FSDP does not support the {args.optimizer} optimizer yet."
 
         if (
             args.data_parallel_sharding_strategy in ["optim_grads_params", "optim_grads"]
@@ -1034,15 +1174,22 @@ def validate_args(args, defaults={}):
             )
 
         if args.data_parallel_sharding_strategy == "optim_grads_params":
-            assert args.check_weight_hash_across_dp_replicas_interval is None, \
-                'check_weight_hash_across_dp_replicas_interval is not supported with optim_grads_params'
+            assert (
+                args.check_weight_hash_across_dp_replicas_interval is None
+            ), 'check_weight_hash_across_dp_replicas_interval is not supported with optim_grads_params'
+
+        assert (
+            os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1"
+        ), 'FSDP requires CUDA_DEVICE_MAX_CONNECTIONS > 1 or unset.'
 
-        assert os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') != "1", \
-            'FSDP requires CUDA_DEVICE_MAX_CONNECTIONS > 1 or unset.'
+        assert (
+            args.ckpt_format == "fsdp_dtensor"
+        ), "Megatron-FSDP requires the `fsdp_dtensor` checkpointing format."
+
+        assert (
+            args.ckpt_format == "fsdp_dtensor"
+        ), "Megatron-FSDP requires the `fsdp_dtensor` checkpointing format."
 
-        assert args.ckpt_format == "fsdp_dtensor", \
-            "Megatron-FSDP requires the `fsdp_dtensor` checkpointing format."
-    
     if args.nccl_ub and args.use_megatron_fsdp:
         # In Megatron-LM, required implementation for manual registration is already provided.
         # So we enable the manual registration by default when nccl-ub and use_megatron_fsdp is set.
@@ -1050,8 +1197,10 @@ def validate_args(args, defaults={}):
         warn_rank_0('FSDP manual registration is enabled by default when nccl-ub is enabled')
 
     if args.fsdp_manual_registration:
-        assert args.use_megatron_fsdp, "FSDP manual registration is only supported with Megatron FSDP."
-        assert args.nccl_ub, "FSDP manual registration is only supported with --nccl-ub argument."      
+        assert (
+            args.use_megatron_fsdp
+        ), "FSDP manual registration is only supported with Megatron FSDP."
+        assert args.nccl_ub, "FSDP manual registration is only supported with --nccl-ub argument."
 
     # Parameters dtype.
     args.params_dtype = torch.float
@@ -1062,16 +1211,19 @@ def validate_args(args, defaults={}):
         # where NaNs in grads / loss are signal to the loss scaler.
         if not args.loss_scale:
             args.check_for_nan_in_loss_and_grad = False
-            warn_rank_0('Setting args.check_for_nan_in_loss_and_grad to False since '
-                        'dynamic loss scaling is being used')
+            warn_rank_0(
+                'Setting args.check_for_nan_in_loss_and_grad to False since '
+                'dynamic loss scaling is being used'
+            )
     if args.bf16:
         assert not args.fp16
         args.params_dtype = torch.bfloat16
         # bfloat16 requires gradient accumulation and all-reduce to
         # be done in fp32.
         if args.accumulate_allreduce_grads_in_fp32:
-            assert args.main_grads_dtype == torch.float32, \
-                "--main-grads-dtype can only be fp32 when --accumulate-allreduce-grads-in-fp32 is set"
+            assert (
+                args.main_grads_dtype == torch.float32
+            ), "--main-grads-dtype can only be fp32 when --accumulate-allreduce-grads-in-fp32 is set"
 
         if args.grad_reduce_in_bf16:
             args.accumulate_allreduce_grads_in_fp32 = False
@@ -1079,21 +1231,31 @@ def validate_args(args, defaults={}):
             args.accumulate_allreduce_grads_in_fp32 = True
             print_rank_0('accumulate and all-reduce gradients in fp32 for bfloat16 data type.')
     if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope:
-        assert not args.check_for_nan_in_loss_and_grad, \
-        "--no-check-for-nan-in-loss-and-grad should be set with --cuda-graph-scope=full_iteration for training. Note: If you are trying to use full_iteration CUDA graphs for inference, please use --cuda-graph-scope full_iteration_inference instead"
-    
-    if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration_inference in args.cuda_graph_scope:
+        assert (
+            not args.check_for_nan_in_loss_and_grad
+        ), "--no-check-for-nan-in-loss-and-grad should be set with --cuda-graph-scope=full_iteration for training. Note: If you are trying to use full_iteration CUDA graphs for inference, please use --cuda-graph-scope full_iteration_inference instead"
+
+    if (
+        args.cuda_graph_impl == "local"
+        and CudaGraphScope.full_iteration_inference in args.cuda_graph_scope
+    ):
         if args.fp8 is not None:
-            assert args.transformer_impl == "inference_optimized", \
-                "fp8 with full_iteration_inference CUDA graphs is only supported with " \
+            assert args.transformer_impl == "inference_optimized", (
+                "fp8 with full_iteration_inference CUDA graphs is only supported with "
                 "--transformer-impl=inference_optimized"
-            assert args.fp8_recipe == "mxfp8", \
-                "Only --fp8-recipe=mxfp8 is supported with full_iteration_inference CUDA graphs"
+            )
+            assert (
+                args.fp8_recipe == "mxfp8"
+            ), "Only --fp8-recipe=mxfp8 is supported with full_iteration_inference CUDA graphs"
 
     if args.cuda_graph_impl == 'local':
-        assert args.inference_dynamic_batching_num_cuda_graphs > 0 or args.inference_dynamic_batching_num_cuda_graphs == -1, \
-            'inference_dynamic_batching_num_cuda_graphs should be a positive integer or -1' \
+        assert (
+            args.inference_dynamic_batching_num_cuda_graphs > 0
+            or args.inference_dynamic_batching_num_cuda_graphs == -1
+        ), (
+            'inference_dynamic_batching_num_cuda_graphs should be a positive integer or -1'
             '-1 means that we will automatically determine the number of CUDA graphs to capture based on the `max_requests` value.'
+        )
 
     print_rank_0('using {} for parameters ...'.format(args.params_dtype))
 
@@ -1110,55 +1272,44 @@ def validate_args(args, defaults={}):
     if args.rl_use_sequence_packing:
         args.consumed_train_bins = 0
 
-    # Support for variable sequence lengths across batches/microbatches.
-    # set it if the dataloader supports generation of variable sequence lengths
-    # across batches/microbatches. Due to additional communication overhead
-    # during pipeline parallelism, it should not be set if sequence length
-    # is constant during training.
-    args.variable_seq_lengths = False
-
     # Iteration-based training.
     # Skip these checks when skip_train is set: LR config is irrelevant.
     if args.train_iters and not args.skip_train:
         # If we use iteration-based training, make sure the
         # sample-based options are off.
-        assert args.train_samples is None, \
-            'expected iteration-based training'
-        assert args.lr_decay_samples is None, \
-            'expected iteration-based learning rate decay'
-        assert args.lr_warmup_samples == 0, \
-            'expected iteration-based learning rate warmup'
+        assert args.train_samples is None, 'expected iteration-based training'
+        assert args.lr_decay_samples is None, 'expected iteration-based learning rate decay'
+        assert args.lr_warmup_samples == 0, 'expected iteration-based learning rate warmup'
         if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_iters == 0, \
-                'can only specify one of lr-warmup-fraction and lr-warmup-iters'
+            assert (
+                args.lr_warmup_iters == 0
+            ), 'can only specify one of lr-warmup-fraction and lr-warmup-iters'
 
     # Sample-based training.
     if args.train_samples and not args.skip_train:
         # If we use sample-based training, make sure the
         # iteration-based options are off.
-        assert args.train_iters is None, \
-            'expected sample-based training'
-        assert args.lr_decay_iters is None, \
-            'expected sample-based learning rate decay'
-        assert args.lr_warmup_iters == 0, \
-            'expected sample-based learnig rate warmup'
+        assert args.train_iters is None, 'expected sample-based training'
+        assert args.lr_decay_iters is None, 'expected sample-based learning rate decay'
+        assert args.lr_warmup_iters == 0, 'expected sample-based learnig rate warmup'
         if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_samples == 0, \
-                'can only specify one of lr-warmup-fraction ' \
-                'and lr-warmup-samples'
+            assert args.lr_warmup_samples == 0, (
+                'can only specify one of lr-warmup-fraction ' 'and lr-warmup-samples'
+            )
 
     if args.num_layers is not None:
-        assert args.encoder_num_layers is None, \
-            'cannot have both num-layers and encoder-num-layers specified'
+        assert (
+            args.encoder_num_layers is None
+        ), 'cannot have both num-layers and encoder-num-layers specified'
         args.encoder_num_layers = args.num_layers
     else:
-        assert args.encoder_num_layers is not None, \
-            'either num-layers or encoder-num-layers should be specified'
+        assert (
+            args.encoder_num_layers is not None
+        ), 'either num-layers or encoder-num-layers should be specified'
         args.num_layers = args.encoder_num_layers
 
     # Check required arguments.
-    required_args = ['num_layers', 'hidden_size', 'num_attention_heads',
-                     'max_position_embeddings']
+    required_args = ['num_layers', 'hidden_size', 'num_attention_heads', 'max_position_embeddings']
     for req_arg in required_args:
         _check_arg_is_not_none(args, req_arg)
 
@@ -1179,9 +1330,10 @@ def validate_args(args, defaults={}):
         args.kv_channels = args.hidden_size // args.num_attention_heads
 
     if args.seq_length is not None and args.context_parallel_size > 1:
-        assert args.seq_length % (args.context_parallel_size * 2) == 0, \
-            'seq-length should be a multiple of 2 * context-parallel-size ' \
+        assert args.seq_length % (args.context_parallel_size * 2) == 0, (
+            'seq-length should be a multiple of 2 * context-parallel-size '
             'if context-parallel-size > 1.'
+        )
 
     if args.seq_length is not None:
         assert args.encoder_seq_length is None
@@ -1191,9 +1343,10 @@ def validate_args(args, defaults={}):
         args.seq_length = args.encoder_seq_length
 
     if args.seq_length is not None:
-        assert args.max_position_embeddings >= args.seq_length, \
-            f"max_position_embeddings ({args.max_position_embeddings}) must be greater than " \
+        assert args.max_position_embeddings >= args.seq_length, (
+            f"max_position_embeddings ({args.max_position_embeddings}) must be greater than "
             f"or equal to seq_length ({args.seq_length})."
+        )
     if args.decoder_seq_length is not None:
         assert args.max_position_embeddings >= args.decoder_seq_length
     if args.lr is not None:
@@ -1212,8 +1365,9 @@ def validate_args(args, defaults={}):
     if args.fp16_lm_cross_entropy:
         assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
     if args.fp32_residual_connection:
-        assert args.fp16 or args.bf16, \
-            'residual connection in fp32 only supported when using fp16 or bf16.'
+        assert (
+            args.fp16 or args.bf16
+        ), 'residual connection in fp32 only supported when using fp16 or bf16.'
 
     if args.moe_grouped_gemm:
         dc = torch.cuda.get_device_capability()
@@ -1242,30 +1396,33 @@ def validate_args(args, defaults={}):
     # Persistent fused layer norm.
     if not is_torch_min_version("1.11.0a0"):
         args.no_persist_layer_norm = True
-        print_rank_0('Persistent fused layer norm kernel is supported from '
-                     'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
-                     'Defaulting to no_persist_layer_norm=True')
+        print_rank_0(
+            'Persistent fused layer norm kernel is supported from '
+            'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
+            'Defaulting to no_persist_layer_norm=True'
+        )
 
     # Activation recomputing.
     if args.distribute_saved_activations:
-        assert args.tensor_model_parallel_size > 1, 'can distribute ' \
-            'recomputed activations only across tensor model ' \
-            'parallel groups'
-        assert args.recompute_granularity == 'full', \
-            'distributed recompute activations is only '\
-            'application to full recompute granularity'
-        assert args.recompute_method is not None, \
-            'for distributed recompute activations to work you '\
-            'need to use a recompute method '
-        assert is_torch_min_version("1.10.0a0"), \
-            'distributed recompute activations are supported for pytorch ' \
-            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \
+        assert args.tensor_model_parallel_size > 1, (
+            'can distribute ' 'recomputed activations only across tensor model ' 'parallel groups'
+        )
+        assert args.recompute_granularity == 'full', (
+            'distributed recompute activations is only ' 'application to full recompute granularity'
+        )
+        assert args.recompute_method is not None, (
+            'for distributed recompute activations to work you ' 'need to use a recompute method '
+        )
+        assert is_torch_min_version("1.10.0a0"), (
+            'distributed recompute activations are supported for pytorch '
+            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current '
             f'pytorch version is v{get_torch_version()}.'
+        )
 
     if args.recompute_granularity == 'selective':
-        assert args.recompute_method is None, \
-            'recompute method is not yet supported for ' \
-            'selective recomputing granularity'
+        assert args.recompute_method is None, (
+            'recompute method is not yet supported for ' 'selective recomputing granularity'
+        )
 
     # disable sequence parallelism when tp=1
     # to avoid change in numerics when
@@ -1279,19 +1436,63 @@ def validate_args(args, defaults={}):
         args.sequence_parallel = False
 
     if args.tp_comm_overlap:
-        assert args.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled'
+        assert (
+            args.sequence_parallel == True
+        ), 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled'
+
+    if args.dynamic_context_parallel:
+        assert (
+            not args.enable_cuda_graph
+        ), 'Dynamic context parallelism not supported with CUDA Graph'
+        assert (
+            not args.use_megatron_fsdp
+        ), 'Dynamic context parallelism not supported with Megatron FSDP'
+        assert (
+            args.dataloader_type == 'single'
+        ), 'Dynamic context parallelism only supported with single dataloader type'
+        assert (
+            args.calculate_per_token_loss
+        ), 'Dynamic context parallelism must be used with --calculate-per-token-loss'
+        if args.sequence_packing_scheduler is None:
+            args.sequence_packing_scheduler = 'default_dynamic_cp'
+        if args.sequence_packing_scheduler != 'default_dynamic_cp':
+            raise ValueError(
+                'Dynamic context parallelism requires '
+                'sequence_packing_scheduler=default_dynamic_cp'
+            )
+
+        dp_cp_size = args.data_parallel_size * args.context_parallel_size
+        assert args.min_dynamic_context_parallel_size <= dp_cp_size, (
+            f'min_dynamic_context_parallel_size ({args.min_dynamic_context_parallel_size}) '
+            f'must be <= dp_size * cp_size ({dp_cp_size})'
+        )
+
+        import warnings
 
-    if args.hybrid_context_parallel:
-        assert not args.pipeline_model_parallel_size > 1, 'Hybrid context parallelism not supported with pipeline parallelism'
-        assert not args.enable_cuda_graph, 'Hybrid context parallelism not supported with CUDA Graph'
-        assert not args.use_megatron_fsdp, 'Hybrid context parallelism not supported with Megatron FSDP'
-        assert args.dataloader_type == 'single', 'Hybrid context parallelism only supported with single dataloader type'
-        assert args.calculate_per_token_loss, 'Hybrid context parallelism must be used with --calculate-per-token-loss'
+        warnings.warn(
+            f"Dynamic CP enabled: dp_size * context_parallel_size="
+            f"{args.data_parallel_size * args.context_parallel_size} "
+            f"will be used as the maximum dynamic CP group size. "
+            f"Dynamic CP groups will range from "
+            f"min_dynamic_context_parallel_size={args.min_dynamic_context_parallel_size} "
+            f"to {args.data_parallel_size * args.context_parallel_size}."
+        )
+
+    if args.sequence_packing_scheduler is not None:
+        if args.sequence_packing_scheduler == 'dp_balanced':
+            total_cp_ranks = args.context_parallel_size
+        else:
+            total_cp_ranks = args.data_parallel_size * args.context_parallel_size
+        assert total_cp_ranks * args.max_seqlen_per_dp_cp_rank >= args.seq_length, (
+            f'Packed sequence buffer size ({total_cp_ranks * args.max_seqlen_per_dp_cp_rank}) '
+            f'must be >= single sequence max length ({args.seq_length})'
+        )
 
     # disable async_tensor_model_parallel_allreduce when
     # model parallel memory optimization is enabled
-    if (args.tensor_model_parallel_size > 1 or args.context_parallel_size > 1) \
-        and get_device_arch_version() < 10:
+    if (
+        args.tensor_model_parallel_size > 1 or args.context_parallel_size > 1
+    ) and get_device_arch_version() < 10:
         # CUDA_DEVICE_MAX_CONNECTIONS requirement no longer exists since the Blackwell architecture
         if args.use_torch_fsdp2 or args.use_megatron_fsdp:
             fsdp_impl = "Torch-FSDP2" if args.use_torch_fsdp2 else "Megatron-FSDP"
@@ -1314,9 +1515,10 @@ def validate_args(args, defaults={}):
                 args.rank,
             )
         else:
-            assert os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') == "1", \
-                "Using tensor model parallelism or context parallelism require setting the environment variable " \
+            assert os.environ.get('CUDA_DEVICE_MAX_CONNECTIONS') == "1", (
+                "Using tensor model parallelism or context parallelism require setting the environment variable "
                 "CUDA_DEVICE_MAX_CONNECTIONS to 1"
+            )
 
     # Setting FSDP communication groups for high priority streams for Blackwell and later architectures
     # Assigning high priority to communication streams ensures that communication kernels are scheduled
@@ -1324,7 +1526,7 @@ def validate_args(args, defaults={}):
     if args.use_torch_fsdp2 or args.use_megatron_fsdp and get_device_arch_version() >= 10:
         if 'dp_cp' not in args.high_priority_stream_groups:
             args.high_priority_stream_groups.append('dp_cp')
-        if args.expert_model_parallel_size  > 1 and 'ep_dp' not in args.high_priority_stream_groups:
+        if args.expert_model_parallel_size > 1 and 'ep_dp' not in args.high_priority_stream_groups:
             args.high_priority_stream_groups.append('ep_dp')
 
     # Disable bias gelu fusion if we are disabling bias altogether
@@ -1336,21 +1538,24 @@ def validate_args(args, defaults={}):
         args.add_qkv_bias = True
 
     if args.qk_clip:
-        assert is_te_min_version("2.9.0"), \
-            '--qk-clip is only supported with TE >= 2.9.0.'
-        assert 0.0 < args.qk_clip_alpha < 1.0, \
-            '--qk-clip-alpha must be between 0.0 and 1.0 when using --qk-clip.'
-        assert args.qk_clip_threshold > 0, \
-            '--qk-clip-threshold must be greater than 0 when using --qk-clip.'
+        assert is_te_min_version("2.9.0"), '--qk-clip is only supported with TE >= 2.9.0.'
+        assert (
+            0.0 < args.qk_clip_alpha < 1.0
+        ), '--qk-clip-alpha must be between 0.0 and 1.0 when using --qk-clip.'
+        assert (
+            args.qk_clip_threshold > 0
+        ), '--qk-clip-threshold must be greater than 0 when using --qk-clip.'
 
     # decoupled log max attention logit check
     if args.log_max_attention_logit:
-        assert is_te_min_version("2.9.0"), \
-            '--log-max-attention-logit is only supported with TE >= 2.9.0.'
+        assert is_te_min_version(
+            "2.9.0"
+        ), '--log-max-attention-logit is only supported with TE >= 2.9.0.'
 
     if args.decoupled_lr is not None or args.decoupled_min_lr is not None:
-        assert not args.use_legacy_models, \
-            '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.'
+        assert (
+            not args.use_legacy_models
+        ), '--decoupled-lr and --decoupled-min-lr is not supported in legacy models.'
 
     # Legacy RoPE arguments
     if args.use_rotary_position_embeddings:
@@ -1373,8 +1578,9 @@ def validate_args(args, defaults={}):
 
     # MultiModal rotary embeddings arguments
     if args.position_embedding_type == "mrope":
-        assert args.mrope_section is not None, \
-            '--mrope-section should be set when using --position-embedding-type mrope.'
+        assert (
+            args.mrope_section is not None
+        ), '--mrope-section should be set when using --position-embedding-type mrope.'
 
     # MoE Spec check
     if args.num_experts == 0:
@@ -1388,13 +1594,19 @@ def validate_args(args, defaults={}):
         assert not args.use_legacy_models, "Context parallelism is not supported in legacy models."
 
     # Expert parallelism check
-    if args.expert_model_parallel_size  > 1:
-        assert args.num_experts is not None, "num_experts must be non None to use expert model parallelism"
-        assert args.num_experts % args.expert_model_parallel_size == 0, \
-            "Number of experts should be a multiple of expert model parallel_size."
+    if args.expert_model_parallel_size > 1:
+        assert (
+            args.num_experts is not None
+        ), "num_experts must be non None to use expert model parallelism"
+        assert (
+            args.num_experts % args.expert_model_parallel_size == 0
+        ), "Number of experts should be a multiple of expert model parallel_size."
 
     # MoE router check
-    if isinstance(args.moe_router_load_balancing_type, list) and len(args.moe_router_load_balancing_type) == 1:
+    if (
+        isinstance(args.moe_router_load_balancing_type, list)
+        and len(args.moe_router_load_balancing_type) == 1
+    ):
         args.moe_router_load_balancing_type = args.moe_router_load_balancing_type[0]
     if isinstance(args.moe_aux_loss_coeff, list) and len(args.moe_aux_loss_coeff) == 1:
         args.moe_aux_loss_coeff = args.moe_aux_loss_coeff[0]
@@ -1406,20 +1618,26 @@ def validate_args(args, defaults={}):
     # torch_dcp (torch.distributed.checkpoint) checkpointing format checks.
     if args.ckpt_format == "torch_dcp":
         assert args.use_torch_fsdp2, "--ckpt-format torch_dcp is only tested with FSDP."
-        assert args.tensor_model_parallel_size <= 1, \
-            "--ckpt-format torch_dcp is not tested with megatron tensor parallelism."
-        assert args.pipeline_model_parallel_size <= 1, \
-            "--ckpt-format torch_dcp is not tested with megatron pipeline parallelism."
+        assert (
+            args.tensor_model_parallel_size <= 1
+        ), "--ckpt-format torch_dcp is not tested with megatron tensor parallelism."
+        assert (
+            args.pipeline_model_parallel_size <= 1
+        ), "--ckpt-format torch_dcp is not tested with megatron pipeline parallelism."
 
     # fsdp_dtensor checkpointing format checks.
     if args.ckpt_format == "fsdp_dtensor":
-        assert args.use_megatron_fsdp, "--ckpt-format fsdp_dtensor is only tested with Megatron FSDP."
+        assert (
+            args.use_megatron_fsdp
+        ), "--ckpt-format fsdp_dtensor is only tested with Megatron FSDP."
 
     # Data blend checks
-    assert args.mock_data + \
-           bool(args.data_path) + \
-           any([args.train_data_path, args.valid_data_path, args.test_data_path]) \
-           <= 1, "A single data source must be provided in training mode, else None"
+    assert (
+        args.mock_data
+        + bool(args.data_path)
+        + any([args.train_data_path, args.valid_data_path, args.test_data_path])
+        <= 1
+    ), "A single data source must be provided in training mode, else None"
 
     if args.fim_data:
         extra_tokens = [
@@ -1432,16 +1650,21 @@ def validate_args(args, defaults={}):
         assert not args.mock_data, "Mock dataset is not supported with FIM dataset."
         assert args.fim_rate, "--fim-rate should be specified."
         assert args.fim_spm_rate, "--fim-spm-rate should be specified."
-        assert all(token is not None for token in extra_tokens), "FIM extra tokens should be specified."
+        assert all(
+            token is not None for token in extra_tokens
+        ), "FIM extra tokens should be specified."
 
     # Deterministic mode
     if args.deterministic_mode:
         assert not args.use_flash_attn, "Flash attention can not be used in deterministic mode."
-        assert not args.cross_entropy_loss_fusion, "Cross Entropy Fusion is currently not deterministic."
+        assert (
+            not args.cross_entropy_loss_fusion
+        ), "Cross Entropy Fusion is currently not deterministic."
 
         all_reduce_choices = ["Tree", "Ring", "CollnetDirect", "CollnetChain", "^NVLS"]
-        assert os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices, \
-            f"NCCL_ALGO must be one of {all_reduce_choices}."
+        assert (
+            os.getenv("NCCL_ALGO", -1) != -1 and os.getenv("NCCL_ALGO") in all_reduce_choices
+        ), f"NCCL_ALGO must be one of {all_reduce_choices}."
 
         torch.use_deterministic_algorithms(True)
 
@@ -1472,9 +1695,13 @@ def validate_args(args, defaults={}):
             args.use_distributed_optimizer = False
 
         assert not args.use_torch_fsdp2, "Emerging optimizer does not support Torch-FSDP2 for now."
-        assert not args.use_megatron_fsdp, "Emerging optimizer does not support Megatron-FSDP for now."
-        assert args.ckpt_format in ["torch", "torch_dist"], "Emerging optimizer supports torch and torch_dist checkpoint format."
-
+        assert (
+            not args.use_megatron_fsdp
+        ), "Emerging optimizer does not support Megatron-FSDP for now."
+        assert args.ckpt_format in [
+            "torch",
+            "torch_dist",
+        ], "Emerging optimizer supports torch and torch_dist checkpoint format."
 
     # Make sure all functionality that requires Gloo process groups is disabled.
     if not args.use_gloo_process_groups:
@@ -1485,11 +1712,14 @@ def validate_args(args, defaults={}):
             assert args.use_dist_ckpt
 
             if args.dist_ckpt_optim_fully_reshardable:
-                assert not args.distrib_optim_fully_reshardable_mem_efficient, \
-                    '--distrib-optim-fully-reshardable-mem-efficient requires -enable-gloo-process-groups'
+                assert (
+                    not args.distrib_optim_fully_reshardable_mem_efficient
+                ), '--distrib-optim-fully-reshardable-mem-efficient requires -enable-gloo-process-groups'
 
     if args.fake_process_group:
-        assert args.moe_token_dispatcher_type != "flex", "Fake process group is not supported with flex token dispatcher."
+        assert (
+            args.moe_token_dispatcher_type != "flex"
+        ), "Fake process group is not supported with flex token dispatcher."
         # Disable nan check for fake process group
         args.check_for_nan_in_loss_and_grad = False
         warn_rank_0('check_for_nan_in_loss_and_grad is set to False for fake process group.')
@@ -1499,23 +1729,31 @@ def validate_args(args, defaults={}):
 
     # Checkpointing
     if args.ckpt_fully_parallel_save_deprecated and args.rank == 0:
-        print('--ckpt-fully-parallel-save flag is deprecated and has no effect.'
-              ' Use --no-ckpt-fully-parallel-save to disable parallel save.')
+        print(
+            '--ckpt-fully-parallel-save flag is deprecated and has no effect.'
+            ' Use --no-ckpt-fully-parallel-save to disable parallel save.'
+        )
     if (
         args.use_dist_ckpt
         and not args.ckpt_fully_parallel_save
         and args.use_distributed_optimizer
         and args.rank == 0
     ):
-        print('Warning: With non-parallel ckpt save and DistributedOptimizer,'
-              ' it will be impossible to resume training with different parallelism.'
-              ' Consider removing flag --no-ckpt-fully-parallel-save.')
+        print(
+            'Warning: With non-parallel ckpt save and DistributedOptimizer,'
+            ' it will be impossible to resume training with different parallelism.'
+            ' Consider removing flag --no-ckpt-fully-parallel-save.'
+        )
     if args.use_dist_ckpt_deprecated and args.rank == 0:
-        print('--use-dist-ckpt is deprecated and has no effect.'
-              ' Use --ckpt-format to select the checkpoint format.')
+        print(
+            '--use-dist-ckpt is deprecated and has no effect.'
+            ' Use --ckpt-format to select the checkpoint format.'
+        )
     if args.dist_ckpt_format_deprecated and args.rank == 0:
-        print('--dist-ckpt-format is deprecated and has no effect.'
-              ' Use --ckpt-format to select the checkpoint format.')
+        print(
+            '--dist-ckpt-format is deprecated and has no effect.'
+            ' Use --ckpt-format to select the checkpoint format.'
+        )
 
     if args.use_dist_ckpt and args.ckpt_fully_parallel_load:
         if args.ckpt_fully_parallel_load_exchange_algo != "broadcast":
@@ -1537,14 +1775,13 @@ def validate_args(args, defaults={}):
 
     # Inference args
     if args.inference_batch_times_seqlen_threshold > -1:
-        assert args.pipeline_model_parallel_size > 1, \
-            "--inference-batch-times-seqlen-threshold requires setting --pipeline-model-parallel-size > 1."
+        assert (
+            args.pipeline_model_parallel_size > 1
+        ), "--inference-batch-times-seqlen-threshold requires setting --pipeline-model-parallel-size > 1."
         assert (
             args.cuda_graph_impl == "none"
         ), "Pipeline-parallel microbatched inference is incompatible with CUDA graphs"
 
-
-
     # MoE upcycling check
     if args.moe_use_upcycling:
         assert args.save is not None, "When using upcycling, the --save option must be specified."
@@ -1561,10 +1798,16 @@ def validate_args(args, defaults={}):
     if args.skip_train and not args.perform_rl_step and not args.no_load_optim:
         args.no_load_optim = True
         warn_rank_0('enabling --no-load-optim when skipping training.')
-    if args.skip_train and args.perform_rl_step and args.no_load_optim and args.rl_offload_optimizer_during_inference:
-        assert False, \
-            '--no-load-optim with --skip-train --perform-rl-step skips the optimizer; ' \
+    if (
+        args.skip_train
+        and args.perform_rl_step
+        and args.no_load_optim
+        and args.rl_offload_optimizer_during_inference
+    ):
+        assert False, (
+            '--no-load-optim with --skip-train --perform-rl-step skips the optimizer; '
             '--rl-offload-optimizer-during-inference is incompatible (no optimizer to offload).'
+        )
 
     # Optimizer CPU offload check
     if args.optimizer_cpu_offload:
@@ -1577,22 +1820,40 @@ def validate_args(args, defaults={}):
             "must be used in conjunction with `--fp8-recipe delayed`."
         )
 
+    if args.offload_optimizer_states:
+        assert (
+            args.use_distributed_optimizer
+        ), "offload_optimizer_states is only supported with distributed optimizer"
+        assert (
+            args.optimizer == 'adam'
+        ), "offload_optimizer_states is only supported with adam optimizer"
+        assert (
+            not args.use_megatron_fsdp
+        ), "offload_optimizer_states does not support Megatron-FSDP for now."
+
     if args.non_persistent_ckpt_type == "local":
-        assert args.non_persistent_local_ckpt_dir is not None, "Tried to use local checkpointing without specifying --local-ckpt-dir!"
+        assert (
+            args.non_persistent_local_ckpt_dir is not None
+        ), "Tried to use local checkpointing without specifying --local-ckpt-dir!"
     if args.replication:
-        assert args.replication_jump is not None, "--replication requires the value of --replication-jump!"
-        assert args.non_persistent_ckpt_type == "local", f"--replication requires args.non_persistent_ckpt_type == 'local', but got: {args.non_persistent_ckpt_type}"
+        assert (
+            args.replication_jump is not None
+        ), "--replication requires the value of --replication-jump!"
+        assert (
+            args.non_persistent_ckpt_type == "local"
+        ), f"--replication requires args.non_persistent_ckpt_type == 'local', but got: {args.non_persistent_ckpt_type}"
     elif args.replication_jump:
         warn_rank_0("--replication-jump was specified despite not using replication. Ignoring.")
         args.replication_jump = None
 
     if args.delay_wgrad_compute:
-        assert args.transformer_impl == 'transformer_engine', \
-            "Delaying wgrad compute is only supported with transformer_engine implementation"
+        assert (
+            args.transformer_impl == 'transformer_engine'
+        ), "Delaying wgrad compute is only supported with transformer_engine implementation"
         if args.overlap_grad_reduce:
-            assert is_te_min_version("2.8.0"), (
-                "overlap_grad_reduce is only supported with TE >= 2.8.0 when enabling delay_wgrad_compute"
-            )
+            assert is_te_min_version(
+                "2.8.0"
+            ), "overlap_grad_reduce is only supported with TE >= 2.8.0 when enabling delay_wgrad_compute"
             wgrad_in_graph_scope = CudaGraphScope.attn in args.cuda_graph_scope or (
                 CudaGraphScope.moe_router in args.cuda_graph_scope
                 and args.moe_shared_expert_intermediate_size is not None
@@ -1619,14 +1880,18 @@ def validate_args(args, defaults={}):
             )
 
     if args.fine_grained_activation_offloading:
-        assert args.transformer_impl == 'transformer_engine', \
-            "Fine-grained activation offloading is only supported with transformer_engine implementation"
+        assert (
+            args.transformer_impl == 'transformer_engine'
+        ), "Fine-grained activation offloading is only supported with transformer_engine implementation"
         if is_te_min_version("2.10.0"):
-            assert os.getenv("NVTE_CPU_OFFLOAD_V1", "0") == "1", \
-                "For fine-grained activation offloading with TE >= 2.10.0, NVTE_CPU_OFFLOAD_V1 should be set to 1 to avoid offloading weights."
+            assert (
+                os.getenv("NVTE_CPU_OFFLOAD_V1", "0") == "1"
+            ), "For fine-grained activation offloading with TE >= 2.10.0, NVTE_CPU_OFFLOAD_V1 should be set to 1 to avoid offloading weights."
 
     if args.mtp_num_layers:
-        assert not args.use_legacy_models, "The legacy Megatron models does not support Multi-Token Prediction (MTP)."
+        assert (
+            not args.use_legacy_models
+        ), "The legacy Megatron models does not support Multi-Token Prediction (MTP)."
         # MTP is compatible with position embedding types that use position_ids.
         supported_position_types = ["learned_absolute", "rope", "mrope", "none"]
         assert args.position_embedding_type in supported_position_types, (
@@ -1662,18 +1927,28 @@ def validate_args(args, defaults={}):
         warn_rank_0(
             'full scope is deprecated. Use empty cuda_graph_scope to capture the whole layer.'
         )
-    
+
     if args.multi_latent_attention:
-        assert not args.group_query_attention, "Group query attention is mutually exclusive with multi latent attention."
-        
+        assert (
+            not args.group_query_attention
+        ), "Group query attention is mutually exclusive with multi latent attention."
+
     if args.mla_down_proj_fusion:
-        assert args.multi_latent_attention, "--mla-down-proj-fusion requires --multi-latent-attention"
+        assert (
+            args.multi_latent_attention
+        ), "--mla-down-proj-fusion requires --multi-latent-attention"
 
     # MoE latent projections
     if args.moe_latent_size is not None:
-        assert args.moe_latent_size > 0, "MoE latent projection dimension has to be greater than zero."
-        assert args.num_experts is not None, "MoE latent projections are applicable only for MoE models."
-        assert not args.use_legacy_models, "MoE latent projections are only supported for mcore models."
+        assert (
+            args.moe_latent_size > 0
+        ), "MoE latent projection dimension has to be greater than zero."
+        assert (
+            args.num_experts is not None
+        ), "MoE latent projections are applicable only for MoE models."
+        assert (
+            not args.use_legacy_models
+        ), "MoE latent projections are only supported for mcore models."
 
     if args.tiktoken_special_tokens and not args.tokenizer_special_tokens:
         warn_rank_0(
@@ -1681,7 +1956,7 @@ def validate_args(args, defaults={}):
             "Use --tokenizer-special-tokens instead."
         )
         args.tokenizer_special_tokens = args.tiktoken_special_tokens
-    
+
     if args.tokenizer_hf_use_fast:
         warn_rank_0(
             "--tokenizer-hf-use-fast argument is deprecated and will be removed soon. "
@@ -1705,6 +1980,7 @@ def validate_args(args, defaults={}):
 def _print_args(title, args):
     """Print arguments."""
     from megatron.training.utils import is_rank0
+
     if is_rank0():
         print(f'------------------------ {title} ------------------------', flush=True)
         str_list = []
@@ -1729,7 +2005,9 @@ def core_transformer_config_from_args(args, config_class=None):
         config_class = MLATransformerConfig
 
     if args.heterogeneous_layers_config_path is not None:
-        assert not args.multi_latent_attention, "Multi latent attention with heterogeneous layers is not supported."
+        assert (
+            not args.multi_latent_attention
+        ), "Multi latent attention with heterogeneous layers is not supported."
         config_class = HeterogeneousTransformerConfig
 
     # Translate args to core transformer configuration
@@ -1742,9 +2020,10 @@ def core_transformer_config_from_args(args, config_class=None):
     kw_args['pipeline_dtype'] = args.params_dtype
     kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm
     kw_args['num_moe_experts'] = args.num_experts
+    kw_args['actual_vocab_size'] = args.padded_vocab_size
     kw_args['rotary_interleaved'] = args.rotary_interleaved
-    kw_args['num_layers_in_first_pipeline_stage']= args.decoder_first_pipeline_num_layers
-    kw_args['num_layers_in_last_pipeline_stage']= args.decoder_last_pipeline_num_layers
+    kw_args['num_layers_in_first_pipeline_stage'] = args.decoder_first_pipeline_num_layers
+    kw_args['num_layers_in_last_pipeline_stage'] = args.decoder_last_pipeline_num_layers
     kw_args['fp8_param'] = args.fp8_param_gather
     kw_args['fp4_param'] = args.fp4_param_gather
     if args.swiglu:
@@ -1772,15 +2051,16 @@ def core_transformer_config_from_args(args, config_class=None):
         # Pop 'rope_type' to let the config class use the default value.
         kw_args.pop('rope_type', None)
     else:
-        assert (args.multi_latent_attention or args.rope_type == 'rope'), (
-            f'Common attention only support rope_type="rope", but got {args.rope_type}.'
-        )
+        assert (
+            args.multi_latent_attention or args.rope_type == 'rope'
+        ), f'Common attention only support rope_type="rope", but got {args.rope_type}.'
 
     if len(args.cp_comm_type) == 1:
         kw_args['cp_comm_type'] = args.cp_comm_type[0]
     if args.hybrid_layer_pattern is not None:
         kw_args['is_hybrid_model'] = True
         from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols
+
         if Symbols.DS_ATTENTION in args.hybrid_layer_pattern:
             kw_args['experimental_attention_variant'] = 'dsa'
 
@@ -1816,209 +2096,339 @@ def _add_transformer_engine_args(parser):
     group = parser.add_argument_group(title='Transformer-Engine')
 
     # FP4 related arguments
-    group.add_argument('--fp4-param-gather', action='store_true',
-                       help='Keep the compute param in fp4 (do not use any other intermediate '
-                            'dtype) and perform the param all-gather in fp4.')
+    group.add_argument(
+        '--fp4-param-gather',
+        action='store_true',
+        help='Keep the compute param in fp4 (do not use any other intermediate '
+        'dtype) and perform the param all-gather in fp4.',
+    )
     # FP8 related arguments
-    group.add_argument('--fp8-param-gather', action='store_true',
-                       help='Keep the compute param in fp8 (do not use any other intermediate '
-                            'dtype) and perform the param all-gather in fp8.')
+    group.add_argument(
+        '--fp8-param-gather',
+        action='store_true',
+        help='Keep the compute param in fp8 (do not use any other intermediate '
+        'dtype) and perform the param all-gather in fp8.',
+    )
     # TE precision config file
-    group.add_argument('--te-precision-config-file', default=None,
-                       help='Configuration file to select per-module precision overrides. '
-                       'See TransformerEngineMixedPrecision.md')
+    group.add_argument(
+        '--te-precision-config-file',
+        default=None,
+        help='Configuration file to select per-module precision overrides. '
+        'See TransformerEngineMixedPrecision.md',
+    )
     return parser
 
+
 def _add_inference_args(parser):
     group = parser.add_argument_group(title='inference')
 
-    group.add_argument('--inference-batch-times-seqlen-threshold',
-                       type=int, default=-1,
-                       help='If (batch-size * sequence-length) is smaller than this threshold'
-                       'then batches will not be split up for pipelining.'
-                       'Requires setting --pipeline-model-parallel-size > 1.'
-                       'Setting this to -1 indicates that batch pipelining is not used.')
-    group.add_argument('--max-tokens-to-oom',
-                       type=int, default=12000,
-                       help='Maximum number of tokens during inference'
-                       'tokens here is # in prompt + # to generate'
-                       'Allows us to throw an error before OOM crashes server')
-    group.add_argument('--output-bert-embeddings', action='store_true',
-                       help='Output Bert embeddings (via mean pooling) from '
-                       'model, rather than its binary head output or entire '
-                       'hidden batch.')
-    group.add_argument('--bert-embedder-type', default="megatron",
-                       choices=["megatron", "huggingface"],
-                       help='Select either Megatron or Huggingface as the '
-                       'Bert embedder.')
-    group.add_argument('--cuda-graph-scope', nargs='+', type=lambda scope: CudaGraphScope[scope] if scope != "full" else scope, default=[],
-                       help='Determines the CUDA graphs capturing scope. '
-                       'choices: "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba", "full_iteration". '
-                       '"attn": captures operations in TransformerLayer._forward_attention(). '
-                       '"mlp": captures operations in TransformerLayer._forward_mlp() for a dense layer. '
-                       '"moe": captures operations in TransformerLayer._forward_mlp() for a MoE layer. '
-                       '"moe_router": captures operations in TransformerLayer._forward_mlp() up to MoELayer.router(), '
-                       'including the shared experts if they are not overlapped with EP comm. '
-                       '"moe_preprocess": captures operations in MoELayer.preprocess(). Must be used together with "moe_router". '
-                       '"mamba": captures the mamba layer. '
-                       '"full_iteration": captures a whole training iteration. '
-                       '"full_iteration_inference": captures a whole inference iteration. '
-                       'full_iteration and full_iteration_inference scopes are only supported with --cuda-graph-impl=local, other scopes are only supported with --cuda-graph-impl=transformer_engine. '
-                       'If not specified, the default scope is to capture the whole Transformer layer. '
-                       'For backward compatibility, we still allow passing "full" to specify capturing the whole layer, and convert it to an empty list.')
-    group.add_argument('--use-legacy-static-engine', action='store_true', default=False,
-                       help='Use legacy static engine. (Current static engine uses dynamic engine under the hood)',
-                       dest='use_legacy_static_engine')
-    group.add_argument('--inference-max-requests', type=int, default=8,
-                       help='Maximum number of requests for inference.',
-                       dest='inference_max_requests')
-    group.add_argument('--inference-max-seq-length', type=int, default=2560,
-                       help='Maximum sequence length expected for inference (prefill + decode).',
-                       dest='inference_max_seq_length')
-    group.add_argument('--inference-dynamic-batching',
-                       action='store_true', default=False,
-                       help='Enable dynamic batching mode.')
-    group.add_argument('--inference-dynamic-batching-buffer-size-gb',
-                       type=float, default=40.,
-                       help='Amount of on-GPU memory allocated for the KV cache. '
-                       'The total amount of memory allocated for the KV cache '
-                       '(CPU + GPU memory) depends on the value set for the '
-                       'unified virtual memory (UVM) level (via '
-                       '`--inference-dynamic-batching-unified-memory-level`).'
-                       'If the UVM level is 0, then only GPU memory is used and '
-                       'the total memory equals `buffer_size_gb`. If the UVM '
-                       'level is 1, then additional memory is utilized on the '
-                       'CPU and the total memory equals `buffer_size_gb + '
-                       'paused_buffer_size_gb`.')
-    group.add_argument('--inference-dynamic-batching-paused-buffer-size-gb',
-                       type=float, default=None,
-                       help='Amount of memory reserved for paused requests in '
-                       'the dynamic inference context. Active requests are '
-                       'paused when there are not enough active blocks available '
-                       'to continue generating a request.')
-    group.add_argument('--inference-dynamic-batching-mamba-memory-ratio', type=float, default=None,
-                       help='Percentage of memory buffer to allocate for Mamba states. '
-                       'If not specified, allocates Mamba state tensors for each KV cache block. '
-                       'Only used for hybrid models.')
-    group.add_argument('--inference-dynamic-batching-block-size',
-                       type=int, default=256,
-                       help='KV cache block size. '
-                       'It should be a multiple of 256')
-    group.add_argument('--inference-dynamic-batching-max-requests',
-                       type=int, default=None,
-                       help='Override the inference context\'s `max_requests`. '
-                       'By default, `max_requests` is set to the number of '
-                       'blocks in the context\'s memory buffer.')
-    group.add_argument('--inference-dynamic-batching-max-tokens',
-                       type=int, default=None,
-                       help='Override the inference context\'s default `max_tokens`.')
-    group.add_argument('--inference-dynamic-batching-num-cuda-graphs',
-                       type=int, default=16,
-                       help='Maximum number of cuda graphs to capture, where the '
-                       'cuda graph batch sizes range from 1 to `max_requests`. '
-                       '(See `dynamic_context.py` for details on how '
-                       '`max_requests` is computed). Due to rounding, the actual '
-                       'number of cuda graphs may not equal this argument.'
-                       'The user can also pass -1, in which case we automatically determine the number of graphs ' \
-                       'to capture based on the `max_requests`.')
-    group.add_argument('--inference-dynamic-batching-track-paused-request-events',
-                       action='store_true',
-                       help='Track paused request ids by adding \'paused\' events '
-                       'to each request\'s event history. This has a very minor '
-                       'impact on latency.')
-    group.add_argument('--inference-dynamic-batching-track-generated-token-events',
-                       action='store_true',
-                       help='Track per-token events with timestamps for each generated token. '
-                       'When enabled, each generated token creates a GENERATED_TOKEN event '
-                       'with a timestamp, useful for per-token latency analysis.')
-    group.add_argument('--decode-only-cuda-graphs',
-                       action='store_true', default=False,
-                       help='Only use cuda graphs for decode-only steps, not prefill and mixed steps.')
-    group.add_argument('--inference-dynamic-batching-unified-memory-level',
-                       type=int, default=0, choices=[0, 1],
-                       help='Set unified memory usage within the dynamic '
-                       'inference context. The levels are: 0) no unified memory, '
-                       '1) allocate `memory_buffer` in unified memory. '
-                       'Eventually, additional levels will be included to '
-                       'control other tensors within the context.')
-    group.add_argument('--enable-chunked-prefill', dest='enable_chunked_prefill',
-                       action='store_true', default=False,
-                       help="Enable chunked prefill (disabled by default)")
-    group.add_argument('--num-speculative-tokens', type=int, default=0,
-                       help='Number of speculative tokens generated during decode')
-    group.add_argument('--inference-dynamic-batching-prefix-caching',
-                       dest='inference_dynamic_batching_enable_prefix_caching',
-                       action=argparse.BooleanOptionalAction,
-                       default=False,
-                       help='Enable/disable prefix caching for dynamic batching inference. '
-                       'When disabled, KV cache blocks cannot be shared between '
-                       'requests with identical prompt prefixes.')
-    group.add_argument('--inference-dynamic-batching-prefix-caching-eviction-policy',
-                       type=str, default='ref_zero',
-                       choices=['ref_zero', 'lru'],
-                       dest='inference_dynamic_batching_prefix_caching_eviction_policy',
-                       help='Eviction policy for prefix caching blocks. '
-                       '"ref_zero" (default) immediately returns blocks to the '
-                       'free pool when ref_count hits 0. "lru" keeps blocks '
-                       'cached and evicts via LRU only when space is needed.')
-    group.add_argument('--inference-dynamic-batching-prefix-caching-coordinator-policy',
-                       type=str, default='first_prefix_block',
-                       choices=['longest_prefix', 'first_prefix_block', 'round_robin'],
-                       dest='inference_dynamic_batching_prefix_caching_coordinator_policy',
-                       help='Coordinator routing policy for prefix caching. '
-                       '"first_prefix_block" (default) routes based on the first '
-                       'block hash only. "longest_prefix" routes to the rank with '
-                       'the longest matching prefix. "round_robin" ignores prefix '
-                       'affinity and cycles through ranks.')
-    group.add_argument('--inference-dynamic-batching-prefix-caching-routing-alpha',
-                       type=float, default=0.5,
-                       dest='inference_dynamic_batching_prefix_caching_routing_alpha',
-                       help='Weight for prefix-aware routing score: '
-                       'score = alpha * match + (1 - alpha) * normalized_load. '
-                       'Higher alpha favors prefix cache hits; lower alpha '
-                       'favors load balance. Default: 0.5.')
-    group.add_argument('--inference-dynamic-batching-prefix-caching-mamba-gb',
-                       type=float, default=None,
-                       dest='inference_dynamic_batching_prefix_caching_mamba_gb',
-                       help='GPU memory budget (in GB) for the Mamba state cache '
-                       'used by prefix caching on hybrid models. When set, Mamba '
-                       'states at block boundaries are cached for reuse.')
-    group.add_argument('--inference-dynamic-batching-cuda-graph-max-tokens',
-                       type=int, default=16384,
-                       help='Maximum number of tokens to capture in a cuda graph.')
-    group.add_argument('--inference-dynamic-batching-cuda-graph-mixed-prefill-count',
-                       type=int, default=16,
-                       help='Number of mixed prefill requests to capture in a cuda graph.')
-    group.add_argument('--inference-logging-step-interval', type=int, default=0,
-                       help='Step interval for logging inference metrics. '
-                            'Default to 0 to disable inference logging.')
-    group.add_argument('--inference-text-gen-server-logging', action=argparse.BooleanOptionalAction,
-                       required=False, default=False,
-                       help='Enable per-request logging in the inference text generation server.')
-    group.add_argument('--inference-wandb-logging', action=argparse.BooleanOptionalAction,
-                       required=False, default=False, help='Enable inference wandb logging.')
-    group.add_argument("--inference-coordinator-port", type=int,
-                       help="This port will be used to setup the inference coordinator on node-0")
-    group.add_argument('--mamba-inference-conv-states-dtype', type=str,
-                       choices=['bf16', 'fp16', 'fp32'], default='bf16',
-                       help='Dtype for the Mamba inference conv states tensor')
-    group.add_argument('--mamba-inference-ssm-states-dtype', type=str,
-                       choices=['bf16', 'fp16', 'fp32'], default='bf16',
-                       help='Dtype for the Mamba inference SSM states tensor')
-    group.add_argument('--inference-use-synchronous-zmq-collectives', action=argparse.BooleanOptionalAction,
-                       required=False, default=False, help='Use synchronous ZMQ collectives for inference. Helps in reducing performance variability for MoEs.')
-    return parser
-
-
-def _add_network_size_args(parser):
-    exclude = [
-        # cannot provide callables over CLI
-        "timers",
-        "finalize_model_grads_func",
-        "grad_scale_func",
-        "no_sync_func",
-        "grad_sync_func",
-        "param_sync_func",
+    group.add_argument(
+        '--inference-batch-times-seqlen-threshold',
+        type=int,
+        default=-1,
+        help='If (batch-size * sequence-length) is smaller than this threshold'
+        'then batches will not be split up for pipelining.'
+        'Requires setting --pipeline-model-parallel-size > 1.'
+        'Setting this to -1 indicates that batch pipelining is not used.',
+    )
+    group.add_argument(
+        '--max-tokens-to-oom',
+        type=int,
+        default=12000,
+        help='Maximum number of tokens during inference'
+        'tokens here is # in prompt + # to generate'
+        'Allows us to throw an error before OOM crashes server',
+    )
+    group.add_argument(
+        '--output-bert-embeddings',
+        action='store_true',
+        help='Output Bert embeddings (via mean pooling) from '
+        'model, rather than its binary head output or entire '
+        'hidden batch.',
+    )
+    group.add_argument(
+        '--bert-embedder-type',
+        default="megatron",
+        choices=["megatron", "huggingface"],
+        help='Select either Megatron or Huggingface as the ' 'Bert embedder.',
+    )
+    group.add_argument(
+        '--cuda-graph-scope',
+        nargs='+',
+        type=lambda scope: CudaGraphScope[scope] if scope != "full" else scope,
+        default=[],
+        help='Determines the CUDA graphs capturing scope. '
+        'choices: "attn", "mlp", "moe", "moe_router", "moe_preprocess", "mamba", "full_iteration". '
+        '"attn": captures operations in TransformerLayer._forward_attention(). '
+        '"mlp": captures operations in TransformerLayer._forward_mlp() for a dense layer. '
+        '"moe": captures operations in TransformerLayer._forward_mlp() for a MoE layer. '
+        '"moe_router": captures operations in TransformerLayer._forward_mlp() up to MoELayer.router(), '
+        'including the shared experts if they are not overlapped with EP comm. '
+        '"moe_preprocess": captures operations in MoELayer.preprocess(). Must be used together with "moe_router". '
+        '"mamba": captures the mamba layer. '
+        '"full_iteration": captures a whole training iteration. '
+        '"full_iteration_inference": captures a whole inference iteration. '
+        'full_iteration and full_iteration_inference scopes are only supported with --cuda-graph-impl=local, other scopes are only supported with --cuda-graph-impl=transformer_engine. '
+        'If not specified, the default scope is to capture the whole Transformer layer. '
+        'For backward compatibility, we still allow passing "full" to specify capturing the whole layer, and convert it to an empty list.',
+    )
+    group.add_argument(
+        '--use-legacy-static-engine',
+        action='store_true',
+        default=False,
+        help='Use legacy static engine. (Current static engine uses dynamic engine under the hood)',
+        dest='use_legacy_static_engine',
+    )
+    group.add_argument(
+        '--inference-max-requests',
+        type=int,
+        default=8,
+        help='Maximum number of requests for inference.',
+        dest='inference_max_requests',
+    )
+    group.add_argument(
+        '--inference-max-seq-length',
+        type=int,
+        default=2560,
+        help='Maximum sequence length expected for inference (prefill + decode).',
+        dest='inference_max_seq_length',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching',
+        action='store_true',
+        default=False,
+        help='Enable dynamic batching mode.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-buffer-size-gb',
+        type=float,
+        default=40.0,
+        help='Amount of on-GPU memory allocated for the KV cache. '
+        'The total amount of memory allocated for the KV cache '
+        '(CPU + GPU memory) depends on the value set for the '
+        'unified virtual memory (UVM) level (via '
+        '`--inference-dynamic-batching-unified-memory-level`).'
+        'If the UVM level is 0, then only GPU memory is used and '
+        'the total memory equals `buffer_size_gb`. If the UVM '
+        'level is 1, then additional memory is utilized on the '
+        'CPU and the total memory equals `buffer_size_gb + '
+        'paused_buffer_size_gb`.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-paused-buffer-size-gb',
+        type=float,
+        default=None,
+        help='Amount of memory reserved for paused requests in '
+        'the dynamic inference context. Active requests are '
+        'paused when there are not enough active blocks available '
+        'to continue generating a request.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-mamba-memory-ratio',
+        type=float,
+        default=None,
+        help='Percentage of memory buffer to allocate for Mamba states. '
+        'If not specified, allocates Mamba state tensors for each KV cache block. '
+        'Only used for hybrid models.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-block-size',
+        type=int,
+        default=256,
+        help='KV cache block size. ' 'It should be a multiple of 256',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-max-requests',
+        type=int,
+        default=None,
+        help='Override the inference context\'s `max_requests`. '
+        'By default, `max_requests` is set to the number of '
+        'blocks in the context\'s memory buffer.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-max-tokens',
+        type=int,
+        default=None,
+        help='Override the inference context\'s default `max_tokens`.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-num-cuda-graphs',
+        type=int,
+        default=16,
+        help='Maximum number of cuda graphs to capture, where the '
+        'cuda graph batch sizes range from 1 to `max_requests`. '
+        '(See `dynamic_context.py` for details on how '
+        '`max_requests` is computed). Due to rounding, the actual '
+        'number of cuda graphs may not equal this argument.'
+        'The user can also pass -1, in which case we automatically determine the number of graphs '
+        'to capture based on the `max_requests`.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-track-paused-request-events',
+        action='store_true',
+        help='Track paused request ids by adding \'paused\' events '
+        'to each request\'s event history. This has a very minor '
+        'impact on latency.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-track-generated-token-events',
+        action='store_true',
+        help='Track per-token events with timestamps for each generated token. '
+        'When enabled, each generated token creates a GENERATED_TOKEN event '
+        'with a timestamp, useful for per-token latency analysis.',
+    )
+    group.add_argument(
+        '--decode-only-cuda-graphs',
+        action='store_true',
+        default=False,
+        help='Only use cuda graphs for decode-only steps, not prefill and mixed steps.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-unified-memory-level',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help='Set unified memory usage within the dynamic '
+        'inference context. The levels are: 0) no unified memory, '
+        '1) allocate `memory_buffer` in unified memory. '
+        'Eventually, additional levels will be included to '
+        'control other tensors within the context.',
+    )
+    group.add_argument(
+        '--enable-chunked-prefill',
+        dest='enable_chunked_prefill',
+        action='store_true',
+        default=False,
+        help="Enable chunked prefill (disabled by default)",
+    )
+    group.add_argument(
+        '--num-speculative-tokens',
+        type=int,
+        default=0,
+        help='Number of speculative tokens generated during decode',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-prefix-caching',
+        dest='inference_dynamic_batching_enable_prefix_caching',
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help='Enable/disable prefix caching for dynamic batching inference. '
+        'When disabled, KV cache blocks cannot be shared between '
+        'requests with identical prompt prefixes.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-prefix-caching-eviction-policy',
+        type=str,
+        default='ref_zero',
+        choices=['ref_zero', 'lru'],
+        dest='inference_dynamic_batching_prefix_caching_eviction_policy',
+        help='Eviction policy for prefix caching blocks. '
+        '"ref_zero" (default) immediately returns blocks to the '
+        'free pool when ref_count hits 0. "lru" keeps blocks '
+        'cached and evicts via LRU only when space is needed.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-prefix-caching-coordinator-policy',
+        type=str,
+        default='first_prefix_block',
+        choices=['longest_prefix', 'first_prefix_block', 'round_robin'],
+        dest='inference_dynamic_batching_prefix_caching_coordinator_policy',
+        help='Coordinator routing policy for prefix caching. '
+        '"first_prefix_block" (default) routes based on the first '
+        'block hash only. "longest_prefix" routes to the rank with '
+        'the longest matching prefix. "round_robin" ignores prefix '
+        'affinity and cycles through ranks.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-prefix-caching-routing-alpha',
+        type=float,
+        default=0.5,
+        dest='inference_dynamic_batching_prefix_caching_routing_alpha',
+        help='Weight for prefix-aware routing score: '
+        'score = alpha * match + (1 - alpha) * normalized_load. '
+        'Higher alpha favors prefix cache hits; lower alpha '
+        'favors load balance. Default: 0.5.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-prefix-caching-mamba-gb',
+        type=float,
+        default=None,
+        dest='inference_dynamic_batching_prefix_caching_mamba_gb',
+        help='GPU memory budget (in GB) for the Mamba state cache '
+        'used by prefix caching on hybrid models. When set, Mamba '
+        'states at block boundaries are cached for reuse.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-cuda-graph-max-tokens',
+        type=int,
+        default=16384,
+        help='Maximum number of tokens to capture in a cuda graph.',
+    )
+    group.add_argument(
+        '--inference-dynamic-batching-cuda-graph-mixed-prefill-count',
+        type=int,
+        default=16,
+        help='Number of mixed prefill requests to capture in a cuda graph.',
+    )
+    group.add_argument(
+        '--inference-logging-step-interval',
+        type=int,
+        default=0,
+        help='Step interval for logging inference metrics. '
+        'Default to 0 to disable inference logging.',
+    )
+    group.add_argument(
+        '--inference-text-gen-server-logging',
+        action=argparse.BooleanOptionalAction,
+        required=False,
+        default=False,
+        help='Enable per-request logging in the inference text generation server.',
+    )
+    group.add_argument(
+        '--inference-wandb-logging',
+        action=argparse.BooleanOptionalAction,
+        required=False,
+        default=False,
+        help='Enable inference wandb logging.',
+    )
+    group.add_argument(
+        "--inference-coordinator-port",
+        type=int,
+        help="This port will be used to setup the inference coordinator on node-0",
+    )
+    group.add_argument(
+        '--mamba-inference-conv-states-dtype',
+        type=str,
+        choices=['bf16', 'fp16', 'fp32'],
+        default='bf16',
+        help='Dtype for the Mamba inference conv states tensor',
+    )
+    group.add_argument(
+        '--mamba-inference-ssm-states-dtype',
+        type=str,
+        choices=['bf16', 'fp16', 'fp32'],
+        default='bf16',
+        help='Dtype for the Mamba inference SSM states tensor',
+    )
+    group.add_argument(
+        '--inference-use-synchronous-zmq-collectives',
+        action=argparse.BooleanOptionalAction,
+        required=False,
+        default=False,
+        help='Use synchronous ZMQ collectives for inference. Helps in reducing performance variability for MoEs.',
+    )
+    return parser
+
+
+def _add_network_size_args(parser):
+    exclude = [
+        # cannot provide callables over CLI
+        "timers",
+        "finalize_model_grads_func",
+        "grad_scale_func",
+        "no_sync_func",
+        "grad_sync_func",
+        "param_sync_func",
         "_cpu_offloading_context",
         "init_method",
         "output_layer_init_method",
@@ -2031,6 +2441,7 @@ def _add_network_size_args(parser):
         "no_rope_freq",
         "moe_layer_freq",
         "linear_attention_freq",
+        "csa_compress_ratios",
         "moe_router_load_balancing_type",
         "moe_aux_loss_coeff",
         "cp_comm_type",
@@ -2085,6 +2496,7 @@ def _add_network_size_args(parser):
         "barrier_with_L1_time",
         # args uses same var with a different name
         "num_moe_experts",
+        "actual_vocab_size",
         "fp8_param",
         "fp4_param",
         # incompatible defaults in dataclass
@@ -2101,81 +2513,159 @@ def _add_network_size_args(parser):
 
     group = parser.add_argument_group(title='network size')
 
-    group.add_argument('--encoder-num-layers', type=int, default=None,
-                       help='Number of encoder transformer layers.')
-    group.add_argument('--decoder-num-layers', type=int, default=None,
-                       help='Number of decoder transformer layers.')
-    group.add_argument('--group-query-attention', action='store_true',
-                          help='Use group-query attention.')
-    group.add_argument('--window-size', type=tuple_type, default=None,
-                       help='Window size for window attention. If not provided, '
-                            'window attention will be disabled.')
-    group.add_argument('--window-attn-skip-freq', type=moe_freq_type, default=None,
-                       help='Frequency of layers to skip window attention. Accepts either: '
-                            '- An integer N: Represents a (N-1):1 ratio, meaning one full attention layer '
-                            'after (N-1) SWA layers. '
-                            '- A string containing a Python list expression that defines a custom pattern, '
-                            'e.g.: "[1,1,1,0]*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] '
-                            'where 1 indicates SWA and 0 indicates full attention. ')
-    group.add_argument('--max-position-embeddings', type=int, default=None,
-                       help='Maximum number of position embeddings to use. '
-                       'This is the size of position embedding.')
-    group.add_argument('--position-embedding-type', type=str, default='learned_absolute',
-                        choices=['learned_absolute', 'rope', 'mrope', 'relative', 'none'],
-                        help='Position embedding type.')
-    group.add_argument('--relative-attention-num-buckets', type=int, default=32,
-                        help='Number of buckets for relative position embeddings.')
-    group.add_argument('--relative-attention-max-distance', type=int, default=128,
-                        help='Maximum distance for relative position embeddings calculation.')
-    group.add_argument('--use-rotary-position-embeddings', action='store_true',
-                       help='Use rotary positional embeddings or not. '
-                       'Deprecated: use --position-embedding-type')
-    group.add_argument('--rotary-base', type=int, default=10000,
-                       help='Base to use for rotary positional embeddings, default 10000')
-    group.add_argument('--rotary-percent', type=float, default=1.0,
-                       help='Percent of rotary dimension to use, default 100%%')
-    group.add_argument('--rotary-seq-len-interpolation-factor', type=int, default=None,
-                       help='Sequence length interpolation factor for rotary embeddings.')
-    group.add_argument('--use-rope-scaling', action='store_true',
-                       help='Apply rope scaling as used in llama3.x')
-    group.add_argument('--rope-scaling-factor', type=float, default=8.0,
-                       help='Rope scaling factor in llama3.x models')
-    group.add_argument('--no-rope-freq', type=no_rope_freq_type, default=None,
-                       help='Controls which layers to skip performing Rotary Position Embedding. Accepts either: '
-                            '- An integer N: Represents a 1:N ratio, meaning RoPE is skipped every N-1 layers. '
-                            '- A string containing a Python list expression that defines a custom pattern, e.g.: '
-                            '"([0]*3+[1]*1)*3" evaluates to [0,0,0,1,0,0,0,1,0,0,0,1] '
-                            'where 1 indicates no-rope layer. This patten is equivalent to --no-rope-freq=4.'
-                            'By default this is disabled and set to None, indicating RoPE will be performed'
-                            'on every layer.'
-                       )
-    group.add_argument('--no-position-embedding',
-                       action='store_false',
-                       help='Disable position embedding. Deprecated: use --position-embedding-type',
-                       dest='add_position_embedding')
-    group.add_argument('--make-vocab-size-divisible-by', type=int, default=128,
-                       help='Pad the vocab size to be divisible by this value.'
-                       'This is added for computational efficieny reasons.')
-    group.add_argument('--openai-gelu', action='store_true',
-                       help='Use OpenAIs GeLU implementation. This option'
-                       'should not be used unless for backward compatibility'
-                       'reasons.')
-    group.add_argument('--squared-relu', action='store_true',
-                       help='Use squared relu activation instead of default gelu')
-    group.add_argument('--swiglu', action='store_true',
-                       help='Use gated linear units and SiLU activation instead of default gelu')
-    group.add_argument('--quick-geglu', action='store_true',
-                       help='Use quick geglu activation instead of default gelu')
-    group.add_argument('--onnx-safe', type=bool, required=False,
-                       help='Use workarounds for known problems with '
-                       'Torch ONNX exporter')
-    group.add_argument('--bert-no-binary-head', action='store_false',
-                       help='Disable BERT binary head.',
-                       dest='bert_binary_head')
-    group.add_argument('--untie-embeddings-and-output-weights', action='store_true',
-                       help='Untie embeddings and output weights.')
+    group.add_argument(
+        '--encoder-num-layers', type=int, default=None, help='Number of encoder transformer layers.'
+    )
+    group.add_argument(
+        '--decoder-num-layers', type=int, default=None, help='Number of decoder transformer layers.'
+    )
+    group.add_argument(
+        '--group-query-attention', action='store_true', help='Use group-query attention.'
+    )
+    group.add_argument(
+        '--window-size',
+        type=tuple_type,
+        default=None,
+        help='Window size for window attention. If not provided, '
+        'window attention will be disabled.',
+    )
+    group.add_argument(
+        '--window-attn-skip-freq',
+        type=moe_freq_type,
+        default=None,
+        help='Frequency of layers to skip window attention. Accepts either: '
+        '- An integer N: Represents a (N-1):1 ratio, meaning one full attention layer '
+        'after (N-1) SWA layers. '
+        '- A string containing a Python list expression that defines a custom pattern, '
+        'e.g.: "[1,1,1,0]*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] '
+        'where 1 indicates SWA and 0 indicates full attention. ',
+    )
+    group.add_argument(
+        '--max-position-embeddings',
+        type=int,
+        default=None,
+        help='Maximum number of position embeddings to use. '
+        'This is the size of position embedding.',
+    )
+    group.add_argument(
+        '--position-embedding-type',
+        type=str,
+        default='learned_absolute',
+        choices=['learned_absolute', 'rope', 'mrope', 'relative', 'none'],
+        help='Position embedding type.',
+    )
+    group.add_argument(
+        '--relative-attention-num-buckets',
+        type=int,
+        default=32,
+        help='Number of buckets for relative position embeddings.',
+    )
+    group.add_argument(
+        '--relative-attention-max-distance',
+        type=int,
+        default=128,
+        help='Maximum distance for relative position embeddings calculation.',
+    )
+    group.add_argument(
+        '--use-rotary-position-embeddings',
+        action='store_true',
+        help='Use rotary positional embeddings or not. '
+        'Deprecated: use --position-embedding-type',
+    )
+    group.add_argument(
+        '--rotary-base',
+        type=int,
+        default=10000,
+        help='Base to use for rotary positional embeddings, default 10000',
+    )
+    group.add_argument(
+        '--rotary-percent',
+        type=float,
+        default=1.0,
+        help='Percent of rotary dimension to use, default 100%%',
+    )
+    group.add_argument(
+        '--rotary-seq-len-interpolation-factor',
+        type=int,
+        default=None,
+        help='Sequence length interpolation factor for rotary embeddings.',
+    )
+    group.add_argument(
+        '--use-rope-scaling', action='store_true', help='Apply rope scaling as used in llama3.x'
+    )
+    group.add_argument(
+        '--rope-scaling-factor',
+        type=float,
+        default=8.0,
+        help='Rope scaling factor in llama3.x models',
+    )
+    group.add_argument(
+        '--no-rope-freq',
+        type=no_rope_freq_type,
+        default=None,
+        help='Controls which layers to skip performing Rotary Position Embedding. Accepts either: '
+        '- An integer N: Represents a 1:N ratio, meaning RoPE is skipped every N-1 layers. '
+        '- A string containing a Python list expression that defines a custom pattern, e.g.: '
+        '"([0]*3+[1]*1)*3" evaluates to [0,0,0,1,0,0,0,1,0,0,0,1] '
+        'where 1 indicates no-rope layer. This patten is equivalent to --no-rope-freq=4.'
+        'By default this is disabled and set to None, indicating RoPE will be performed'
+        'on every layer.',
+    )
+    group.add_argument(
+        '--no-position-embedding',
+        action='store_false',
+        help='Disable position embedding. Deprecated: use --position-embedding-type',
+        dest='add_position_embedding',
+    )
+    group.add_argument(
+        '--make-vocab-size-divisible-by',
+        type=int,
+        default=128,
+        help='Pad the vocab size to be divisible by this value.'
+        'This is added for computational efficieny reasons.',
+    )
+    group.add_argument(
+        '--openai-gelu',
+        action='store_true',
+        help='Use OpenAIs GeLU implementation. This option'
+        'should not be used unless for backward compatibility'
+        'reasons.',
+    )
+    group.add_argument(
+        '--squared-relu',
+        action='store_true',
+        help='Use squared relu activation instead of default gelu',
+    )
+    group.add_argument(
+        '--swiglu',
+        action='store_true',
+        help='Use gated linear units and SiLU activation instead of default gelu',
+    )
+    group.add_argument(
+        '--quick-geglu',
+        action='store_true',
+        help='Use quick geglu activation instead of default gelu',
+    )
+    group.add_argument(
+        '--onnx-safe',
+        type=bool,
+        required=False,
+        help='Use workarounds for known problems with ' 'Torch ONNX exporter',
+    )
+    group.add_argument(
+        '--bert-no-binary-head',
+        action='store_false',
+        help='Disable BERT binary head.',
+        dest='bert_binary_head',
+    )
+    group.add_argument(
+        '--untie-embeddings-and-output-weights',
+        action='store_true',
+        help='Untie embeddings and output weights.',
+    )
     return parser
 
+
 def _add_straggler_detector_args(parser):
     from megatron.training.config import StragglerDetectionConfig
 
@@ -2184,297 +2674,560 @@ def _add_straggler_detector_args(parser):
 
     return parser
 
+
 def _add_workload_inspector_server_args(parser):
     group = parser.add_argument_group(title='workload inspector')
-    group.add_argument('--run-workload-inspector-server', action='store_true',
-                       help='If set, enables workload inspector server for on-demand profiling.')
+    group.add_argument(
+        '--run-workload-inspector-server',
+        action='store_true',
+        help='If set, enables workload inspector server for on-demand profiling.',
+    )
     return parser
 
+
 def _add_inprocess_restart_args(parser):
     group = parser.add_argument_group(title='In-process restart')
 
-    group.add_argument('--inprocess-restart', action='store_true',
-                       help='Enables in-process restart.')
-
-    group.add_argument('--inprocess-max-iterations', default=None, type=int,
-                       help='Maximum number of in-process restart iterations.')
-    group.add_argument('--inprocess-monitor-thread-interval', default=1.0, type=float,
-                       help='Monitoring interval (in seconds) for the monitoring thread.')
-    group.add_argument('--inprocess-monitor-process-interval', default=1.0, type=float,
-                       help='Monitoring interval (in seconds) for the monitoring process.')
-    group.add_argument('--inprocess-progress-watchdog-interval', default=1.0, type=float,
-                       help='Interval (in seconds) for automatic progress watchdog timestamp '
-                       'updates.')
-    group.add_argument('--inprocess-heartbeat-interval', default=30, type=float,
-                       help='Monitoring interval (in seconds) for detecting unresponsive ranks.')
-
-    group.add_argument('--inprocess-soft-timeout', default=60, type=float,
-                       help='Soft progress timeout (in seconds).')
-    group.add_argument('--inprocess-hard-timeout', default=90, type=float,
-                       help='Hard progress timeout (in seconds).')
-    group.add_argument('--inprocess-heartbeat-timeout', default=60, type=float,
-                       help='Timeout (in seconds) for a missing rank detection heartbeat.')
-
-    group.add_argument('--inprocess-barrier-timeout', default=120, type=float,
-                       help='Timeout (in seconds) for internal distributed barrier')
-    group.add_argument('--inprocess-completion-timeout', default=120, type=float,
-                       help='Timeout (in seconds) for barrier on completion on all ranks')
-
-    group.add_argument('--inprocess-last-call-wait', default=1, type=float,
-                       help='Time interval (in seconds) for other ranks to report concurrent '
-                       'terminal failures.')
-    group.add_argument('--inprocess-termination-grace-time', default=1, type=float,
-                       help='Interval (in seconds) between SIGTERM and SIGKILL issued on hard '
-                       'timeout')
-
-    group.add_argument('--inprocess-granularity', default='node', type=str,
-                       choices=['node', 'rank'],
-                       help='Granularity for in-process restart.')
-    group.add_argument('--inprocess-active-world-size',
-                       default=int(os.getenv('WORLD_SIZE', '1')), type=int,
-                       help='The number of ranks initially executing the workload. '
-                       'The remaining ranks from the allocation are set aside '
-                       'as warm reserve.')
-    group.add_argument('--inprocess-empty-cuda-cache', action='store_true',
-                       help='Release all unoccupied cached GPU memory on every in-process restart.')
-    return parser
-
-def _add_one_logger_args(parser):
-    group = parser.add_argument_group(title='one logger')
-    group.add_argument('--no-one-logger', action='store_false',
-                       help='If set, disable using one_logger to track E2E metrics'
-                       'Note that one_logger is an internal tool and not '
-                       'available externally. For installation, please go to '
-                       'https://confluence.nvidia.com/display/MLWFO/Package+Repositories'
-                       'for more details',
-                       dest='enable_one_logger')
-    group.add_argument('--one-logger-project', type=str, default='megatron-lm',
-                       help='The one-logger project name. Will ignore if '
-                       '--no-one-logger is set')
-    group.add_argument('--one-logger-run-name', type=str, default=None,
-                       help='The one-logger run name displayed. Will ignore if '
-                       '--no-one-logger is set')
-    group.add_argument('--one-logger-async', action='store_true',
-                       help='If set, forces one_logger to use async mode.')
-    group.add_argument('--app-tag-run-name', type=str, default=None,
-                       help='Jobs belonging to same training run, suppose to '
-                       'have the same name. It will be used to track progress of '
-                       'a training done over multiple different jobs')
-    group.add_argument('--app-tag-run-version', type=str, default='0.0.0',
-                       help='The version of the training of which current job is '
-                       'part of. It will be used to track the changes in the '
-                       'application side which might change the performance '
-                       'baseline')
-    return parser
-
-
-def _add_ft_package_args(parser):
-    group = parser.add_argument_group(title='ft_package')
-    group.add_argument('--enable-ft-package', action='store_true',
-                       help='If set, Fault Tolerance package is enabled. '
-                       'Note: This feature is for Nvidia internal use only.')
-    group.add_argument('--calc-ft-timeouts', action='store_true',
-                       help='If set, FT package will try to automatically compute the timeouts. '
-                       'Note: This feature is for Nvidia internal use only.')
-    group.add_argument('--ft-num-warmup-iters', type=int, default=5,
-                       help='Number of warmup iterations before monitoring step section and '
-                       'out-of-section timeouts. The first N iterations are excluded from '
-                       'timeout monitoring as they can be significantly slower than steady-state. '
-                       'Default: 5. Note: This feature is for Nvidia internal use only.')
-    return parser
-
-
-def _add_logging_args(parser):
-    from megatron.training.config import LoggerConfig
-
-    log_factory = ArgumentGroupFactory(LoggerConfig, exclude = ["log_throughput_to_tensorboard", "throughput_window_size", "memory_keys", "log_l2_norm_grad_to_tensorboard", "log_runtime_to_tensorboard", "runtime_time_unit", "filter_warnings", "modules_to_filter", "set_level_for_all_loggers", "save_config_filepath"])
-    group = log_factory.build_group(parser, title="logging")
-
-    return parser
-
-
-def _add_regularization_args(parser):
-    group = parser.add_argument_group(title='regularization')
-
-    group.add_argument('--weight-decay', type=float, default=0.01,
-                       help='Weight decay coefficient for L2 regularization.')
-    group.add_argument('--apply-wd-to-qk-layernorm', action='store_true',
-                       help='Apply weight decay to qk layernorm as a special case.')
-    group.add_argument('--clip-grad', type=float, default=1.0,
-                       help='Gradient clipping based on global L2 norm.')
-    group.add_argument('--adam-beta1', type=float, default=0.9,
-                       help='First coefficient for computing running averages '
-                       'of gradient and its square')
-    group.add_argument('--adam-beta2', type=float, default=0.999,
-                       help='Second coefficient for computing running averages '
-                       'of gradient and its square')
-    group.add_argument('--adam-eps', type=float, default=1e-08,
-                       help='Term added to the denominator to improve'
-                       'numerical stability')
-    group.add_argument('--sgd-momentum', type=float, default=0.9,
-                       help='Momentum factor for sgd')
-    group.add_argument('--muon-momentum', type=float, default=0.9,
-                       help='Momentum factor for Muon optimizer')
-    group.add_argument('--muon-no-split-qkv', action='store_false', default=True,
-                       dest='muon_split_qkv',
-                       help='Whether to split QKV parameters for Muon optimizer')
-    group.add_argument('--muon-nesterov', action='store_true',
-                       help='Whether to use Nesterov-style momentum in the internal SGD')
-    group.add_argument('--muon-scale-mode', type=str, default='spectral',
-                       choices=['spectral', 'unit_rms_norm', 'shape_scaling'],
-                       help='Scale mode for Muon optimizer. With MuP, set '
-                       '--muon-scale-mode unit_rms_norm to use unit_rms_norm scaling, '
-                       'or set --muon-scale-mode spectral to keep spectral scaling.')
-    group.add_argument('--muon-fp32-matmul-prec', type=str, default='medium',
-                       choices=['low', 'medium', 'high'],
-                       help='FP32 matmul precision for Newton-Schulz iteration')
-    group.add_argument('--muon-coefficient-type', type=str, default='quintic',
-                       help='Newton-Schulz coefficient type for the Muon optimizer. '
-                       'Valid types are discovered from the installed emerging_optimizers '
-                       'package (e.g. simple, quintic, polar_express, aol). '
-                       'Validated at optimizer creation time.')
-    group.add_argument('--muon-num-ns-steps', type=int, default=5,
-                       help='Number of Newton-Schulz steps for Muon optimizer')
-    group.add_argument('--muon-tp-mode', type=str, default='blockwise',
-                       choices=['blockwise', 'duplicated', 'distributed'],
-                       help='How to perform NS calculation for tensor model parallel weights')
-    group.add_argument('--muon-extra-scale-factor', type=float, default=1.0,
-                       help='Additional scale factor for the muon update')
-    group.add_argument('--muon-scalar-optimizer', type=str, default='adam',
-                       choices=['adam', 'lion'],
-                       help='Optimizer for scalar parameters (embeddings, biases, norms) '
-                       'when using muon. Defaults to adam.')
-    group.add_argument('--lion-beta1', type=float, default=0.95,
-                       help='First beta coefficient for Lion optimizer '
-                       '(used in sign update). Default: 0.95.')
-    group.add_argument('--lion-beta2', type=float, default=0.98,
-                       help='Second beta coefficient for Lion optimizer '
-                       '(used in momentum EMA update). Default: 0.98.')
-
-    group.add_argument('--no-weight-decay-cond-type', type=str, choices=['apply_wd_to_qk_layernorm'],
-                       help='Type of no weight decay condition. Choices: '
-                       'None (default): apply weight decay to 1D weights and biases.'
-                       '"apply_wd_to_qk_layernorm": additionally apply weight decay to '
-                       'qk layernorm as a special case.'
-                       'DEPRECATED. Please use --apply-wd-to-qk-layernorm instead. ')
-    return parser
-
+    group.add_argument(
+        '--inprocess-restart', action='store_true', help='Enables in-process restart.'
+    )
 
-def _add_rl_args(parser):
-    group = parser.add_argument_group(title='rl')
-    group.add_argument('--perform-rl-step', action='store_true',
-                       help="Use the RL training step.")
-    group.add_argument('--rl-prompts-per-eval', type=int, default=32,
-                       help='Number of prompts to evaluate for for each RL task.'
-                        'This evaluation can be very expensive when using environments'
-                        'that evaluate pass@k so we default to a lower number.')
-    # TODO(rkirby): allow for "complete" evaluation when --rl-prompts-per-eval is set to -1
-    group.add_argument('--grpo-prompts-per-step', type=int, default=32,
-                       help="Number of GRPO groups (G in the paper).")
-    group.add_argument('--grpo-group-size', type=int, default=2,
-                       help="Number of samples per a GRPO group.")
-    group.add_argument('--rl-num-parallel-generations', type=int, default=None,
-                       help='Number of rollouts being generated by the inference engine simultaneously. '
-                            'Internally divided by grpo_group_size. '
-                            'Requires --rl-partial-rollouts. '
-                            'Mutually exclusive with --rl-num-parallel-generation-batches.')
-    group.add_argument('--rl-num-parallel-generation-batches', type=int, default=None,
-                       help='Number of generation batches in flight. '
-                            'Set to L+1 to allow for L steps of staleness between the inference and training policies. '
-                            'Each batch contains grpo_prompts_per_step groups by default. '
-                            'Requires --rl-partial-rollouts. '
-                            'Mutually exclusive with --rl-num-parallel-generations.')
-    group.add_argument('--rl-generation-batch-size', type=int, default=None,
-                       help='Override the number of groups per generation batch. '
-                            'Defaults to grpo_prompts_per_step when '
-                            '--rl-num-parallel-generation-batches is set.')
-    group.add_argument('--grpo-iterations', type=int, default=2,
-                       help="Number of iterations per a GRPO implementation.")
-    # As in DAPO, we keep upper/lower eps different.
-    # To have a vanilla GRPO, set them to be the same.
-    group.add_argument('--grpo-clamp-eps-lower', type=float, default=0.01,
-                       help="Lower GRPO clipping bound.")
-    group.add_argument('--grpo-clamp-eps-upper', type=float, default=0.01,
-                       help="Upper GRPO clipping bound. In vanilla implementation, equals to the lower one.")
-    group.add_argument('--grpo-kl-beta', type=float, default=0.001,
-                       help="KL term weight in the GRPO loss.")
-    group.add_argument('--grpo-entropy-term-weight', type=float, default=0.0,
-                       help="Entropy term weight in GRPO loss.")
-    group.add_argument('--grpo-filter-groups-with-same-reward', action='store_true',
-                       help="Filter groups with same reward.")
-    group.add_argument('--langrl-env-config', type=str, default=None,
-                       help="Path to YAML config file for RL environment configuration.")
-    group.add_argument('--rl-default-temperature', type=float, default=1.0,
-                       help="Default temperature for model inference.")
-    group.add_argument('--rl-default-top-p', type=float, default=0,
-                       help="Default top-p for model inference.")
-    group.add_argument('--rl-default-top-k', type=int, default=-1,
-                       help="Default top-k for model inference.")
-    group.add_argument('--rl-offload-optimizer-during-inference', action='store_true',
-                       help='Offload optimizer state to CPU during inference/rollout to save GPU memory')
-    group.add_argument('--rl-kv-cache-management-mode', type=str, default='persist',
-                       choices=['persist', 'offload', 'recompute'],
-                       help='KV cache management mode during RL training: '
-                            'persist: leave KV cache in GPU memory (default), '
-                            'offload: offload KV cache to CPU during training, '
-                            'recompute: deallocate KV cache and recompute from scratch each cycle')
-    group.add_argument('--rl-persist-cuda-graphs', action=argparse.BooleanOptionalAction, type=bool, default=False,
-                       help='Persist CUDA graphs when the inference engine is suspended. '
-                            'If False, CUDA graphs are deleted on suspend and re-captured on resume.')
-    group.add_argument('--rl-partial-rollouts', action=argparse.BooleanOptionalAction, default=False,
-                       help='Allow inference to continue generating rollouts while training updates '
-                            'the policy weights. This enables off-policy training where rollouts may '
-                            'be generated with a stale version of the policy. Use '
-                            '--rl-num-parallel-generations or --rl-num-parallel-generation-batches '
-                            'to control the degree of staleness.')
-    group.add_argument('--rl-inference-logprobs-is-correction', action=argparse.BooleanOptionalAction, type=bool, default=False,
-                       help='If set, use inference logprobs in importance sampling correction of the loss.')
-    group.add_argument('--rl-importance-sampling-truncation-coef', type=float, default=None,
-                       help="If --inference-logprobs-is-correction is on and this coefficient is set, apply truncation for the IS correction at GRPO loss.")
-    group.add_argument('--rl-use-sequence-packing', action=argparse.BooleanOptionalAction, type=bool, default=False,
-                       help='Enable sequence packing')
-    group.add_argument('--rl-sequence-packing-max-sequences-per-bin', type=int, default=50,
-                       help='Maximum number of sequences that can be packed into a single bin. ')
-    group.add_argument('--rl-sequence-packing-algo', type=str, default='fifo',
-                       choices=['fifo', 'round-robin'],
-                       help='Algorithm for distributing packed bins across ranks. '
-                            'fifo: first-in-first-out sequential distribution, '
-                            'round-robin: distribute bins cyclically across ranks for better load balancing')
-    group.add_argument('--rl-training-cuda-graphs', action=argparse.BooleanOptionalAction, type=bool,
-                       default=False,
-                       help='If set, do not toggle CUDA graphs on/off between inference and training phases.')
-    group.add_argument('--rl-inference-tensor-model-parallel-size', type=int, default=None,
-                       help='Degree of tensor model parallelism for inference for RL.')     
     group.add_argument(
-        '--rl-inference-pipeline-model-parallel-size',
-        type=int,
+        '--inprocess-max-iterations',
         default=None,
-        help='Degree of pipeline model parallelism for inference for RL.',
+        type=int,
+        help='Maximum number of in-process restart iterations.',
     )
     group.add_argument(
-        '--rl-inference-expert-model-parallel-size',
-        type=int,
-        default=None,
-        help='Degree of expert model parallelism for inference for RL.',
+        '--inprocess-monitor-thread-interval',
+        default=1.0,
+        type=float,
+        help='Monitoring interval (in seconds) for the monitoring thread.',
     )
     group.add_argument(
-        '--rl-inference-expert-tensor-model-parallel-size',
-        type=int,
-        default=None,
-        help='Degree of expert tensor model parallelism for inference for RL. '
-             'For MoE models, this controls the TP size for expert layers specifically. '
-             'Defaults to training expert_tensor_parallel_size if not specified.',
+        '--inprocess-monitor-process-interval',
+        default=1.0,
+        type=float,
+        help='Monitoring interval (in seconds) for the monitoring process.',
     )
     group.add_argument(
-        '--rl-inference-model-unified-memory-level',
-        type=int,
-        default=0,
-        choices=[0, 1],
-        help=(
-            'Allocate the separate RL inference model parameters from a unified virtual memory (UVM) '
-            'CUDA mempool. Level 0 disables UVM (default). Level 1 enables UVM allocation so the '
-            'inference model weights can be prefetched to CPU when idle while keeping CUDA-graph-safe '
-            'device pointers.'
+        '--inprocess-progress-watchdog-interval',
+        default=1.0,
+        type=float,
+        help='Interval (in seconds) for automatic progress watchdog timestamp ' 'updates.',
+    )
+    group.add_argument(
+        '--inprocess-heartbeat-interval',
+        default=30,
+        type=float,
+        help='Monitoring interval (in seconds) for detecting unresponsive ranks.',
+    )
+
+    group.add_argument(
+        '--inprocess-soft-timeout',
+        default=60,
+        type=float,
+        help='Soft progress timeout (in seconds).',
+    )
+    group.add_argument(
+        '--inprocess-hard-timeout',
+        default=90,
+        type=float,
+        help='Hard progress timeout (in seconds).',
+    )
+    group.add_argument(
+        '--inprocess-heartbeat-timeout',
+        default=60,
+        type=float,
+        help='Timeout (in seconds) for a missing rank detection heartbeat.',
+    )
+
+    group.add_argument(
+        '--inprocess-barrier-timeout',
+        default=120,
+        type=float,
+        help='Timeout (in seconds) for internal distributed barrier',
+    )
+    group.add_argument(
+        '--inprocess-completion-timeout',
+        default=120,
+        type=float,
+        help='Timeout (in seconds) for barrier on completion on all ranks',
+    )
+
+    group.add_argument(
+        '--inprocess-last-call-wait',
+        default=1,
+        type=float,
+        help='Time interval (in seconds) for other ranks to report concurrent '
+        'terminal failures.',
+    )
+    group.add_argument(
+        '--inprocess-termination-grace-time',
+        default=1,
+        type=float,
+        help='Interval (in seconds) between SIGTERM and SIGKILL issued on hard ' 'timeout',
+    )
+
+    group.add_argument(
+        '--inprocess-granularity',
+        default='node',
+        type=str,
+        choices=['node', 'rank'],
+        help='Granularity for in-process restart.',
+    )
+    group.add_argument(
+        '--inprocess-active-world-size',
+        default=int(os.getenv('WORLD_SIZE', '1')),
+        type=int,
+        help='The number of ranks initially executing the workload. '
+        'The remaining ranks from the allocation are set aside '
+        'as warm reserve.',
+    )
+    group.add_argument(
+        '--inprocess-empty-cuda-cache',
+        action='store_true',
+        help='Release all unoccupied cached GPU memory on every in-process restart.',
+    )
+    return parser
+
+
+def _add_one_logger_args(parser):
+    group = parser.add_argument_group(title='one logger')
+    group.add_argument(
+        '--no-one-logger',
+        action='store_false',
+        help='If set, disable using one_logger to track E2E metrics'
+        'Note that one_logger is an internal tool and not '
+        'available externally. For installation, please go to '
+        'https://confluence.nvidia.com/display/MLWFO/Package+Repositories'
+        'for more details',
+        dest='enable_one_logger',
+    )
+    group.add_argument(
+        '--one-logger-project',
+        type=str,
+        default='megatron-lm',
+        help='The one-logger project name. Will ignore if ' '--no-one-logger is set',
+    )
+    group.add_argument(
+        '--one-logger-run-name',
+        type=str,
+        default=None,
+        help='The one-logger run name displayed. Will ignore if ' '--no-one-logger is set',
+    )
+    group.add_argument(
+        '--one-logger-async',
+        action='store_true',
+        help='If set, forces one_logger to use async mode.',
+    )
+    group.add_argument(
+        '--app-tag-run-name',
+        type=str,
+        default=None,
+        help='Jobs belonging to same training run, suppose to '
+        'have the same name. It will be used to track progress of '
+        'a training done over multiple different jobs',
+    )
+    group.add_argument(
+        '--app-tag-run-version',
+        type=str,
+        default='0.0.0',
+        help='The version of the training of which current job is '
+        'part of. It will be used to track the changes in the '
+        'application side which might change the performance '
+        'baseline',
+    )
+    return parser
+
+
+def _add_ft_package_args(parser):
+    group = parser.add_argument_group(title='ft_package')
+    group.add_argument(
+        '--enable-ft-package',
+        action='store_true',
+        help='If set, Fault Tolerance package is enabled. '
+        'Note: This feature is for Nvidia internal use only.',
+    )
+    group.add_argument(
+        '--calc-ft-timeouts',
+        action='store_true',
+        help='If set, FT package will try to automatically compute the timeouts. '
+        'Note: This feature is for Nvidia internal use only.',
+    )
+    group.add_argument(
+        '--ft-num-warmup-iters',
+        type=int,
+        default=5,
+        help='Number of warmup iterations before monitoring step section and '
+        'out-of-section timeouts. The first N iterations are excluded from '
+        'timeout monitoring as they can be significantly slower than steady-state. '
+        'Default: 5. Note: This feature is for Nvidia internal use only.',
+    )
+    return parser
+
+
+def _add_logging_args(parser):
+    from megatron.training.config import LoggerConfig
+
+    log_factory = ArgumentGroupFactory(
+        LoggerConfig,
+        exclude=[
+            "log_throughput_to_tensorboard",
+            "throughput_window_size",
+            "memory_keys",
+            "log_l2_norm_grad_to_tensorboard",
+            "log_runtime_to_tensorboard",
+            "runtime_time_unit",
+            "filter_warnings",
+            "modules_to_filter",
+            "set_level_for_all_loggers",
+            "save_config_filepath",
+        ],
+    )
+    group = log_factory.build_group(parser, title="logging")
+
+    return parser
+
+
+def _add_regularization_args(parser):
+    group = parser.add_argument_group(title='regularization')
+
+    group.add_argument(
+        '--weight-decay',
+        type=float,
+        default=0.01,
+        help='Weight decay coefficient for L2 regularization.',
+    )
+    group.add_argument(
+        '--apply-wd-to-qk-layernorm',
+        action='store_true',
+        help='Apply weight decay to qk layernorm as a special case.',
+    )
+    group.add_argument(
+        '--clip-grad', type=float, default=1.0, help='Gradient clipping based on global L2 norm.'
+    )
+    group.add_argument(
+        '--adam-beta1',
+        type=float,
+        default=0.9,
+        help='First coefficient for computing running averages ' 'of gradient and its square',
+    )
+    group.add_argument(
+        '--adam-beta2',
+        type=float,
+        default=0.999,
+        help='Second coefficient for computing running averages ' 'of gradient and its square',
+    )
+    group.add_argument(
+        '--adam-eps',
+        type=float,
+        default=1e-08,
+        help='Term added to the denominator to improve' 'numerical stability',
+    )
+    group.add_argument('--sgd-momentum', type=float, default=0.9, help='Momentum factor for sgd')
+    group.add_argument(
+        '--muon-momentum', type=float, default=0.9, help='Momentum factor for Muon optimizer'
+    )
+    group.add_argument(
+        '--muon-no-split-qkv',
+        action='store_false',
+        default=True,
+        dest='muon_split_qkv',
+        help='Whether to split QKV parameters for Muon optimizer',
+    )
+    group.add_argument(
+        '--muon-nesterov',
+        action='store_true',
+        help='Whether to use Nesterov-style momentum in the internal SGD',
+    )
+    group.add_argument(
+        '--muon-scale-mode',
+        type=str,
+        default='spectral',
+        choices=['spectral', 'unit_rms_norm', 'shape_scaling'],
+        help='Scale mode for Muon optimizer. With MuP, set '
+        '--muon-scale-mode unit_rms_norm to use unit_rms_norm scaling, '
+        'or set --muon-scale-mode spectral to keep spectral scaling.',
+    )
+    group.add_argument(
+        '--muon-fp32-matmul-prec',
+        type=str,
+        default='medium',
+        choices=['low', 'medium', 'high'],
+        help='FP32 matmul precision for Newton-Schulz iteration',
+    )
+    group.add_argument(
+        '--muon-coefficient-type',
+        type=str,
+        default='quintic',
+        help='Newton-Schulz coefficient type for the Muon optimizer. '
+        'Valid types are discovered from the installed emerging_optimizers '
+        'package (e.g. simple, quintic, polar_express, aol). '
+        'Validated at optimizer creation time.',
+    )
+    group.add_argument(
+        '--muon-num-ns-steps',
+        type=int,
+        default=5,
+        help='Number of Newton-Schulz steps for Muon optimizer',
+    )
+    group.add_argument(
+        '--muon-tp-mode',
+        type=str,
+        default='blockwise',
+        choices=['blockwise', 'duplicated', 'distributed'],
+        help='How to perform NS calculation for tensor model parallel weights',
+    )
+    group.add_argument(
+        '--muon-extra-scale-factor',
+        type=float,
+        default=1.0,
+        help='Additional scale factor for the muon update',
+    )
+    group.add_argument(
+        '--muon-scalar-optimizer',
+        type=str,
+        default='adam',
+        choices=['adam', 'lion'],
+        help='Optimizer for scalar parameters (embeddings, biases, norms) '
+        'when using muon. Defaults to adam.',
+    )
+    group.add_argument(
+        '--lion-beta1',
+        type=float,
+        default=0.95,
+        help='First beta coefficient for Lion optimizer ' '(used in sign update). Default: 0.95.',
+    )
+    group.add_argument(
+        '--lion-beta2',
+        type=float,
+        default=0.98,
+        help='Second beta coefficient for Lion optimizer '
+        '(used in momentum EMA update). Default: 0.98.',
+    )
+
+    group.add_argument(
+        '--no-weight-decay-cond-type',
+        type=str,
+        choices=['apply_wd_to_qk_layernorm'],
+        help='Type of no weight decay condition. Choices: '
+        'None (default): apply weight decay to 1D weights and biases.'
+        '"apply_wd_to_qk_layernorm": additionally apply weight decay to '
+        'qk layernorm as a special case.'
+        'DEPRECATED. Please use --apply-wd-to-qk-layernorm instead. ',
+    )
+    return parser
+
+
+def _add_rl_args(parser):
+    group = parser.add_argument_group(title='rl')
+    group.add_argument('--perform-rl-step', action='store_true', help="Use the RL training step.")
+    group.add_argument(
+        '--rl-prompts-per-eval',
+        type=int,
+        default=32,
+        help='Number of prompts to evaluate for for each RL task.'
+        'This evaluation can be very expensive when using environments'
+        'that evaluate pass@k so we default to a lower number.',
+    )
+    # TODO(rkirby): allow for "complete" evaluation when --rl-prompts-per-eval is set to -1
+    group.add_argument(
+        '--grpo-prompts-per-step',
+        type=int,
+        default=32,
+        help="Number of GRPO groups (G in the paper).",
+    )
+    group.add_argument(
+        '--grpo-group-size', type=int, default=2, help="Number of samples per a GRPO group."
+    )
+    group.add_argument(
+        '--rl-num-parallel-generations',
+        type=int,
+        default=None,
+        help='Number of rollouts being generated by the inference engine simultaneously. '
+        'Internally divided by grpo_group_size. '
+        'Requires --rl-partial-rollouts. '
+        'Mutually exclusive with --rl-num-parallel-generation-batches.',
+    )
+    group.add_argument(
+        '--rl-num-parallel-generation-batches',
+        type=int,
+        default=None,
+        help='Number of generation batches in flight. '
+        'Set to L+1 to allow for L steps of staleness between the inference and training policies. '
+        'Each batch contains grpo_prompts_per_step groups by default. '
+        'Requires --rl-partial-rollouts. '
+        'Mutually exclusive with --rl-num-parallel-generations.',
+    )
+    group.add_argument(
+        '--rl-generation-batch-size',
+        type=int,
+        default=None,
+        help='Override the number of groups per generation batch. '
+        'Defaults to grpo_prompts_per_step when '
+        '--rl-num-parallel-generation-batches is set.',
+    )
+    group.add_argument(
+        '--grpo-iterations',
+        type=int,
+        default=2,
+        help="Number of iterations per a GRPO implementation.",
+    )
+    # As in DAPO, we keep upper/lower eps different.
+    # To have a vanilla GRPO, set them to be the same.
+    group.add_argument(
+        '--grpo-clamp-eps-lower', type=float, default=0.01, help="Lower GRPO clipping bound."
+    )
+    group.add_argument(
+        '--grpo-clamp-eps-upper',
+        type=float,
+        default=0.01,
+        help="Upper GRPO clipping bound. In vanilla implementation, equals to the lower one.",
+    )
+    group.add_argument(
+        '--grpo-kl-beta', type=float, default=0.001, help="KL term weight in the GRPO loss."
+    )
+    group.add_argument(
+        '--grpo-entropy-term-weight',
+        type=float,
+        default=0.0,
+        help="Entropy term weight in GRPO loss.",
+    )
+    group.add_argument(
+        '--grpo-filter-groups-with-same-reward',
+        action='store_true',
+        help="Filter groups with same reward.",
+    )
+    group.add_argument(
+        '--langrl-env-config',
+        type=str,
+        default=None,
+        help="Path to YAML config file for RL environment configuration.",
+    )
+    group.add_argument(
+        '--rl-default-temperature',
+        type=float,
+        default=1.0,
+        help="Default temperature for model inference.",
+    )
+    group.add_argument(
+        '--rl-default-top-p', type=float, default=0, help="Default top-p for model inference."
+    )
+    group.add_argument(
+        '--rl-default-top-k', type=int, default=-1, help="Default top-k for model inference."
+    )
+    group.add_argument(
+        '--rl-offload-optimizer-during-inference',
+        action='store_true',
+        help='Offload optimizer state to CPU during inference/rollout to save GPU memory',
+    )
+    group.add_argument(
+        '--rl-kv-cache-management-mode',
+        type=str,
+        default='persist',
+        choices=['persist', 'offload', 'recompute'],
+        help='KV cache management mode during RL training: '
+        'persist: leave KV cache in GPU memory (default), '
+        'offload: offload KV cache to CPU during training, '
+        'recompute: deallocate KV cache and recompute from scratch each cycle',
+    )
+    group.add_argument(
+        '--rl-persist-cuda-graphs',
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help='Persist CUDA graphs when the inference engine is suspended. '
+        'If False, CUDA graphs are deleted on suspend and re-captured on resume.',
+    )
+    group.add_argument(
+        '--rl-partial-rollouts',
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help='Allow inference to continue generating rollouts while training updates '
+        'the policy weights. This enables off-policy training where rollouts may '
+        'be generated with a stale version of the policy. Use '
+        '--rl-num-parallel-generations or --rl-num-parallel-generation-batches '
+        'to control the degree of staleness.',
+    )
+    group.add_argument(
+        '--rl-inference-logprobs-is-correction',
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help='If set, use inference logprobs in importance sampling correction of the loss.',
+    )
+    group.add_argument(
+        '--rl-importance-sampling-truncation-coef',
+        type=float,
+        default=None,
+        help="If --inference-logprobs-is-correction is on and this coefficient is set, apply truncation for the IS correction at GRPO loss.",
+    )
+    group.add_argument(
+        '--rl-use-sequence-packing',
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help='Enable sequence packing',
+    )
+    group.add_argument(
+        '--rl-sequence-packing-max-sequences-per-bin',
+        type=int,
+        default=50,
+        help='Maximum number of sequences that can be packed into a single bin. ',
+    )
+    group.add_argument(
+        '--rl-sequence-packing-algo',
+        type=str,
+        default='fifo',
+        choices=['fifo', 'round-robin'],
+        help='Algorithm for distributing packed bins across ranks. '
+        'fifo: first-in-first-out sequential distribution, '
+        'round-robin: distribute bins cyclically across ranks for better load balancing',
+    )
+    group.add_argument(
+        '--rl-training-cuda-graphs',
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help='If set, do not toggle CUDA graphs on/off between inference and training phases.',
+    )
+    group.add_argument(
+        '--rl-inference-tensor-model-parallel-size',
+        type=int,
+        default=None,
+        help='Degree of tensor model parallelism for inference for RL.',
+    )
+    group.add_argument(
+        '--rl-inference-pipeline-model-parallel-size',
+        type=int,
+        default=None,
+        help='Degree of pipeline model parallelism for inference for RL.',
+    )
+    group.add_argument(
+        '--rl-inference-expert-model-parallel-size',
+        type=int,
+        default=None,
+        help='Degree of expert model parallelism for inference for RL.',
+    )
+    group.add_argument(
+        '--rl-inference-expert-tensor-model-parallel-size',
+        type=int,
+        default=None,
+        help='Degree of expert tensor model parallelism for inference for RL. '
+        'For MoE models, this controls the TP size for expert layers specifically. '
+        'Defaults to training expert_tensor_parallel_size if not specified.',
+    )
+    group.add_argument(
+        '--rl-inference-model-unified-memory-level',
+        type=int,
+        default=0,
+        choices=[0, 1],
+        help=(
+            'Allocate the separate RL inference model parameters from a unified virtual memory (UVM) '
+            'CUDA mempool. Level 0 disables UVM (default). Level 1 enables UVM allocation so the '
+            'inference model weights can be prefetched to CPU when idle while keeping CUDA-graph-safe '
+            'device pointers.'
         ),
     )
     group.add_argument(
@@ -2489,28 +3242,51 @@ def _add_rl_args(parser):
             '2) torch_memory_saver (when UVM is not enabled; requires torch_memory_saver to be installed).'
         ),
     )
-    group.add_argument('--refit-method', type=str, default='gloo',
-                       choices=['nccl', 'gloo', 'nvshmem'],
-                       help=('Method to refit the model weights between training and inference models during RL. '
-                             'nccl: use NCCLCopyService to refit using NCCL; '
-                             'gloo: use GlooCopyService over CPU; '
-                             'nvshmem: use NVSHMEMCopyService to refit using the NVSHMEM.'))
-    group.add_argument('--rl-verify-model-weights-swap', action=argparse.BooleanOptionalAction, default=False,
-                       help='If set, verify that the model weights were correctly transferred by comparing forward pass outputs on'
-                       'the first swap of model weights.')
-
-    group.add_argument('--rl-parallel-generation-tasks', type=int, default=None,
-                       help='Deprecated: use --rl-num-parallel-generations instead.')
-    group.add_argument('--rl-skip-bos-token', action=argparse.BooleanOptionalAction, type=bool, default=False,
-                        help='Skip BOS token at the beginning of the sequences. Default is False.')
-    group.add_argument('--rl-inference-parsers', nargs='*', default=[],
-                       help='List of response parsers to enable for RL inference '
-                            '(e.g. --rl-inference-parsers deepseek-r1-reasoning qwen3-coder-tool).')
+    group.add_argument(
+        '--refit-method',
+        type=str,
+        default='gloo',
+        choices=['nccl', 'gloo', 'nvshmem'],
+        help=(
+            'Method to refit the model weights between training and inference models during RL. '
+            'nccl: use NCCLCopyService to refit using NCCL; '
+            'gloo: use GlooCopyService over CPU; '
+            'nvshmem: use NVSHMEMCopyService to refit using the NVSHMEM.'
+        ),
+    )
+    group.add_argument(
+        '--rl-verify-model-weights-swap',
+        action=argparse.BooleanOptionalAction,
+        default=False,
+        help='If set, verify that the model weights were correctly transferred by comparing forward pass outputs on'
+        'the first swap of model weights.',
+    )
+
+    group.add_argument(
+        '--rl-parallel-generation-tasks',
+        type=int,
+        default=None,
+        help='Deprecated: use --rl-num-parallel-generations instead.',
+    )
+    group.add_argument(
+        '--rl-skip-bos-token',
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help='Skip BOS token at the beginning of the sequences. Default is False.',
+    )
+    group.add_argument(
+        '--rl-inference-parsers',
+        nargs='*',
+        default=[],
+        help='List of response parsers to enable for RL inference '
+        '(e.g. --rl-inference-parsers deepseek-r1-reasoning qwen3-coder-tool).',
+    )
     return parser
 
+
 def _add_training_args(parser):
-    from megatron.training.config import TrainingConfig
-    from megatron.training.config import ProfilingConfig
+    from megatron.training.config import ProfilingConfig, TrainingConfig
 
     prof_factory = ArgumentGroupFactory(ProfilingConfig)
     prof_group = prof_factory.build_group(parser, "profiling")
@@ -2518,96 +3294,191 @@ def _add_training_args(parser):
     train_factory = ArgumentGroupFactory(TrainingConfig)
     group = train_factory.build_group(parser, "training")
 
-    group.add_argument('--batch-size', type=int, default=None,
-                       help='Old batch size parameter, do not use. '
-                       'Use --micro-batch-size instead')
-    group.add_argument('--recompute-activations', action='store_true',
-                       help='recompute activation to allow for training '
-                       'with larger models, sequences, and batch sizes.')
-    group.add_argument('--no-check-for-nan-in-loss-and-grad', action='store_false',
-                       help='Check for NaNs in loss and grad',
-                       dest='check_for_nan_in_loss_and_grad')
-    group.add_argument('--check-for-large-grads', action='store_true',
-                       help='Check for unexpectedly large grads',
-                       dest='check_for_large_grads')
-    group.add_argument('--result-rejected-tracker-filename', type=str, default=None,
-                       help='Optional name of file tracking `result_rejected` events.')
-    group.add_argument('--tp-comm-overlap-cfg', type=str, default=None,
-                       help='Config file when tp_comm_overlap is enabled.')
+    group.add_argument(
+        '--batch-size',
+        type=int,
+        default=None,
+        help='Old batch size parameter, do not use. ' 'Use --micro-batch-size instead',
+    )
+    group.add_argument(
+        '--recompute-activations',
+        action='store_true',
+        help='recompute activation to allow for training '
+        'with larger models, sequences, and batch sizes.',
+    )
+    group.add_argument(
+        '--no-check-for-nan-in-loss-and-grad',
+        action='store_false',
+        help='Check for NaNs in loss and grad',
+        dest='check_for_nan_in_loss_and_grad',
+    )
+    group.add_argument(
+        '--check-for-large-grads',
+        action='store_true',
+        help='Check for unexpectedly large grads',
+        dest='check_for_large_grads',
+    )
+    group.add_argument(
+        '--result-rejected-tracker-filename',
+        type=str,
+        default=None,
+        help='Optional name of file tracking `result_rejected` events.',
+    )
+    group.add_argument(
+        '--tp-comm-overlap-cfg',
+        type=str,
+        default=None,
+        help='Config file when tp_comm_overlap is enabled.',
+    )
 
     # deprecated
-    group.add_argument('--checkpoint-activations', action='store_true',
-                       help='Checkpoint activation to allow for training '
-                       'with larger models, sequences, and batch sizes.')
-    group.add_argument('--no-masked-softmax-fusion',
-                       action='store_false',
-                       help='Disable fusion of query_key_value scaling, '
-                       'masking, and softmax.',
-                       dest='masked_softmax_fusion')
-    group.add_argument('--no-bias-gelu-fusion', action='store_false',
-                       help='Disable bias and gelu fusion.',
-                       dest='bias_gelu_fusion')
-    group.add_argument('--no-bias-swiglu-fusion', action='store_false',
-                       help='Disable bias and swiglu fusion, the fusion is '
-                       'available only when using megatron-core.',
-                       dest='bias_swiglu_fusion')
-    group.add_argument('--no-bias-dropout-fusion', action='store_false',
-                       help='Disable bias and dropout fusion.',
-                       dest='bias_dropout_fusion')
-    group.add_argument('--no-rope-fusion', action='store_false',
-                       help='Disable rope fusion, the fusion is available '
-                       'only when using megatron-core.',
-                       dest='apply_rope_fusion')
-    group.add_argument('--rope-type', type=str, default=None,
-                      choices=['rope', 'yarn'],
-                      help='Type of rope to use. Note that MLA takes yarn by default, '
-                      'and common attention takes rope by default.')
-    group.add_argument('--use-flash-attn', action='store_true',
-                       help='use FlashAttention implementation of attention. '
-                       'https://arxiv.org/abs/2205.14135')
-    group.add_argument('--optimizer', type=str, default='adam',
-                       choices=['adam', 'sgd', 'muon', 'dist_muon', 'lion', 'soap', 'adaptive_muon'],
-                       help='Optimizer function. '
-                            'Note: dist_muon is deprecated; use --optimizer muon '
-                            'with --use-distributed-optimizer instead.')
-    group.add_argument('--optimizer-cpu-offload', action='store_true',
-                       help='Offload optimizer state to CPU')
-    group.add_argument('--optimizer-cuda-graph', action='store_true',
-                       help='Enable CUDA graph for optimizer step')
-    group.add_argument('--optimizer-offload-fraction', type=float, default=1.0,
-                          help='Ratio of optimizer state to offload to CPU')
-    group.add_argument('--use-torch-optimizer-for-cpu-offload', action='store_true',
-                       help="Use torch.optim.Optimizer instead of Megatron's optimizer in optimizer cpu offload mode.")
-    group.add_argument('--overlap-cpu-optimizer-d2h-h2d', action='store_true', default=False,
-                       help='Overlap CPU optimizer step, gradients D2H and updated parameters H2D.')
-    group.add_argument('--dump-param-to-param-group-map', type=str, default=None,
-                        help="Path to a file containing parameter-to-parameter-group mapping. "
-                        "Provide a JSON file that specifies which parameters belong to which "
-                        "parameter group for global coordination.")
-    group.add_argument('--no-pin-cpu-grads', action='store_false', dest='pin_cpu_grads',
-                       help='Disable pinning of CPU memory for gradients.')
-    group.add_argument('--no-pin-cpu-params', action='store_false', dest='pin_cpu_params',
-                       help='Disable pinning of CPU memory for parameters.')
-    group.add_argument('--dataloader-type', type=str, default=None,
-                       choices=['single', 'cyclic', 'external'],
-                       help='Single pass vs multiple pass data loader')
-    group.add_argument('--no-persist-layer-norm', action='store_true',
-                       help='Disable using persistent fused layer norm kernel. '
-                       'This kernel supports only a set of hidden sizes. Please '
-                       'check persist_ln_hidden_sizes if your hidden '
-                       'size is supported.')
-    group.add_argument('--no-gradient-accumulation-fusion',
-                       action='store_false',
-                       help='Disable fusing gradient accumulation to weight '
-                       'gradient computation of linear layers',
-                       dest='gradient_accumulation_fusion')
-    group.add_argument('--use-mcore-models', action='store_true',
-                       dest='deprecated_use_mcore_models',
-                       help='DEPRECATED. Use the implementation from megatron core.'
-                       'Now ignored and mcore models are the default, use '
-                       '--use-legacy-models to not use core models.')
-    group.add_argument('--use-legacy-models', action='store_true',
-                       help='Use the legacy Megatron models, not Megatron-Core models.')
+    group.add_argument(
+        '--checkpoint-activations',
+        action='store_true',
+        help='Checkpoint activation to allow for training '
+        'with larger models, sequences, and batch sizes.',
+    )
+    group.add_argument(
+        '--no-masked-softmax-fusion',
+        action='store_false',
+        help='Disable fusion of query_key_value scaling, ' 'masking, and softmax.',
+        dest='masked_softmax_fusion',
+    )
+    group.add_argument(
+        '--no-bias-gelu-fusion',
+        action='store_false',
+        help='Disable bias and gelu fusion.',
+        dest='bias_gelu_fusion',
+    )
+    group.add_argument(
+        '--no-bias-swiglu-fusion',
+        action='store_false',
+        help='Disable bias and swiglu fusion, the fusion is '
+        'available only when using megatron-core.',
+        dest='bias_swiglu_fusion',
+    )
+    group.add_argument(
+        '--no-bias-dropout-fusion',
+        action='store_false',
+        help='Disable bias and dropout fusion.',
+        dest='bias_dropout_fusion',
+    )
+    group.add_argument(
+        '--no-rope-fusion',
+        action='store_false',
+        help='Disable rope fusion, the fusion is available ' 'only when using megatron-core.',
+        dest='apply_rope_fusion',
+    )
+    group.add_argument(
+        '--rope-type',
+        type=str,
+        default=None,
+        choices=['rope', 'yarn'],
+        help='Type of rope to use. Note that MLA takes yarn by default, '
+        'and common attention takes rope by default.',
+    )
+    group.add_argument(
+        '--use-flash-attn',
+        action='store_true',
+        help='use FlashAttention implementation of attention. ' 'https://arxiv.org/abs/2205.14135',
+    )
+    group.add_argument(
+        '--optimizer',
+        type=str,
+        default='adam',
+        choices=['adam', 'sgd', 'muon', 'dist_muon', 'lion', 'soap', 'adaptive_muon'],
+        help='Optimizer function. '
+        'Note: dist_muon is deprecated; use --optimizer muon '
+        'with --use-distributed-optimizer instead.',
+    )
+    group.add_argument(
+        '--optimizer-cpu-offload', action='store_true', help='Offload optimizer state to CPU'
+    )
+    group.add_argument(
+        '--optimizer-cuda-graph', action='store_true', help='Enable CUDA graph for optimizer step'
+    )
+    group.add_argument(
+        '--optimizer-offload-fraction',
+        type=float,
+        default=1.0,
+        help='Ratio of optimizer state to offload to CPU',
+    )
+    group.add_argument(
+        '--use-torch-optimizer-for-cpu-offload',
+        action='store_true',
+        help="Use torch.optim.Optimizer instead of Megatron's optimizer in optimizer cpu offload mode.",
+    )
+    group.add_argument(
+        '--overlap-cpu-optimizer-d2h-h2d',
+        action='store_true',
+        default=False,
+        help='Overlap CPU optimizer step, gradients D2H and updated parameters H2D.',
+    )
+    group.add_argument(
+        '--dump-param-to-param-group-map',
+        type=str,
+        default=None,
+        help="Path to a file containing parameter-to-parameter-group mapping. "
+        "Provide a JSON file that specifies which parameters belong to which "
+        "parameter group for global coordination.",
+    )
+    group.add_argument(
+        '--no-pin-cpu-grads',
+        action='store_false',
+        dest='pin_cpu_grads',
+        help='Disable pinning of CPU memory for gradients.',
+    )
+    group.add_argument(
+        '--no-pin-cpu-params',
+        action='store_false',
+        dest='pin_cpu_params',
+        help='Disable pinning of CPU memory for parameters.',
+    )
+    group.add_argument(
+        '--offload-optimizer-states',
+        action='store_true',
+        dest='offload_optimizer_states',
+        help='Offload optimizer states to CPU after each optimizer step and '
+        'reload them before the next optimizer step. '
+        'Only support TE FusedAdam optimizer.'
+        'Note that this still uses pure GPU optimizer instead of '
+        'HybridDeviceOptimizer for --optimizer-cpu-offload.',
+    )
+    group.add_argument(
+        '--dataloader-type',
+        type=str,
+        default=None,
+        choices=['single', 'cyclic', 'external'],
+        help='Single pass vs multiple pass data loader',
+    )
+    group.add_argument(
+        '--no-persist-layer-norm',
+        action='store_true',
+        help='Disable using persistent fused layer norm kernel. '
+        'This kernel supports only a set of hidden sizes. Please '
+        'check persist_ln_hidden_sizes if your hidden '
+        'size is supported.',
+    )
+    group.add_argument(
+        '--no-gradient-accumulation-fusion',
+        action='store_false',
+        help='Disable fusing gradient accumulation to weight '
+        'gradient computation of linear layers',
+        dest='gradient_accumulation_fusion',
+    )
+    group.add_argument(
+        '--use-mcore-models',
+        action='store_true',
+        dest='deprecated_use_mcore_models',
+        help='DEPRECATED. Use the implementation from megatron core.'
+        'Now ignored and mcore models are the default, use '
+        '--use-legacy-models to not use core models.',
+    )
+    group.add_argument(
+        '--use-legacy-models',
+        action='store_true',
+        help='Use the legacy Megatron models, not Megatron-Core models.',
+    )
 
     return parser
 
@@ -2627,8 +3498,11 @@ def _add_initialization_args(parser):
     rng_factory = ArgumentGroupFactory(RNGConfig)
     group = rng_factory.build_group(parser, "RNG and initialization")
 
-    group.add_argument('--init-method-xavier-uniform', action='store_true',
-                       help='Enable Xavier uniform parameter initialization')
+    group.add_argument(
+        '--init-method-xavier-uniform',
+        action='store_true',
+        help='Enable Xavier uniform parameter initialization',
+    )
 
     return parser
 
@@ -2639,21 +3513,39 @@ def _add_learning_rate_args(parser):
     sched_factory = ArgumentGroupFactory(SchedulerConfig, exclude=["no_weight_decay_cond_type"])
     group = sched_factory.build_group(parser, title="learning rate and weight decay")
 
-    group.add_argument('--lr', type=float, default=None,
-                       help='Initial learning rate. Depending on decay style '
-                       'and initial warmup, the learning rate at each '
-                       'iteration would be different.')
-    group.add_argument('--warmup', type=int, default=None,
-                       help='Old lr warmup argument, do not use. Use one of the'
-                       '--lr-warmup-* arguments above')
-    group.add_argument('--min-lr', type=float, default=0.0,
-                       help='Minimum value for learning rate. The scheduler'
-                       'clip values below this threshold.')
-    group.add_argument('--decoupled-lr', type=float, default=None,
-                       help='Separate learning rate for the input and output layer')
-    group.add_argument('--decoupled-min-lr', type=float, default=None,
-                       help='Minimum value for learning rate for the input and output layer. The scheduler'
-                       'clip values below this threshold')
+    group.add_argument(
+        '--lr',
+        type=float,
+        default=None,
+        help='Initial learning rate. Depending on decay style '
+        'and initial warmup, the learning rate at each '
+        'iteration would be different.',
+    )
+    group.add_argument(
+        '--warmup',
+        type=int,
+        default=None,
+        help='Old lr warmup argument, do not use. Use one of the' '--lr-warmup-* arguments above',
+    )
+    group.add_argument(
+        '--min-lr',
+        type=float,
+        default=0.0,
+        help='Minimum value for learning rate. The scheduler' 'clip values below this threshold.',
+    )
+    group.add_argument(
+        '--decoupled-lr',
+        type=float,
+        default=None,
+        help='Separate learning rate for the input and output layer',
+    )
+    group.add_argument(
+        '--decoupled-min-lr',
+        type=float,
+        default=None,
+        help='Minimum value for learning rate for the input and output layer. The scheduler'
+        'clip values below this threshold',
+    )
 
     return parser
 
@@ -2661,60 +3553,120 @@ def _add_learning_rate_args(parser):
 def _add_checkpointing_args(parser):
     from megatron.training.config import CheckpointConfig
 
-    ckpt_factory = ArgumentGroupFactory(CheckpointConfig, exclude=["most_recent_k", "save_tokenizer_assets", "save_optim", "save_rng", "load_optim", "load_rng"])
+    ckpt_factory = ArgumentGroupFactory(
+        CheckpointConfig,
+        exclude=[
+            "most_recent_k",
+            "save_tokenizer_assets",
+            "save_optim",
+            "save_rng",
+            "load_optim",
+            "load_rng",
+        ],
+    )
     group = ckpt_factory.build_group(parser, "checkpointing")
 
-    group.add_argument('--no-save-optim', action='store_true', default=None,
-                       help='Do not save current optimizer.')
-    group.add_argument('--no-save-rng', action='store_true', default=None,
-                       help='Do not save current rng state.')
-    group.add_argument('--no-load-optim', action='store_true', default=None,
-                       help='Do not load optimizer when loading checkpoint.')
-    group.add_argument('--no-load-rng', action='store_true', default=None,
-                       help='Do not load rng state when loading checkpoint.')
-    group.add_argument('--use-dist-ckpt', action='store_true',
-                       dest='use_dist_ckpt_deprecated',
-                       help='Deprecated: see --ckpt-format.')
-    group.add_argument('--dist-ckpt-format',
-                       dest='dist_ckpt_format_deprecated',
-                       help='Deprecated: see --ckpt-format.')
-    group.add_argument('--dist-ckpt-workers', type=int, default=1,
-                       help='Number of workers for distributed checkpointing. '
-                       'Only used for async save. '
-                       'If set to 1, the checkpointing is performed in a single process.')
-    group.add_argument('--ckpt-fully-parallel-save', action='store_true',
-                       dest='ckpt_fully_parallel_save_deprecated',
-                       help='Deprecated: see --no-ckpt-fully-parallel-save.')
+    group.add_argument(
+        '--no-save-optim', action='store_true', default=None, help='Do not save current optimizer.'
+    )
+    group.add_argument(
+        '--no-save-rng', action='store_true', default=None, help='Do not save current rng state.'
+    )
+    group.add_argument(
+        '--no-load-optim',
+        action='store_true',
+        default=None,
+        help='Do not load optimizer when loading checkpoint.',
+    )
+    group.add_argument(
+        '--no-load-rng',
+        action='store_true',
+        default=None,
+        help='Do not load rng state when loading checkpoint.',
+    )
+    group.add_argument(
+        '--use-dist-ckpt',
+        action='store_true',
+        dest='use_dist_ckpt_deprecated',
+        help='Deprecated: see --ckpt-format.',
+    )
+    group.add_argument(
+        '--dist-ckpt-format',
+        dest='dist_ckpt_format_deprecated',
+        help='Deprecated: see --ckpt-format.',
+    )
+    group.add_argument(
+        '--dist-ckpt-workers',
+        type=int,
+        default=1,
+        help='Number of workers for distributed checkpointing. '
+        'Only used for async save. '
+        'If set to 1, the checkpointing is performed in a single process.',
+    )
+    group.add_argument(
+        '--ckpt-fully-parallel-save',
+        action='store_true',
+        dest='ckpt_fully_parallel_save_deprecated',
+        help='Deprecated: see --no-ckpt-fully-parallel-save.',
+    )
     return parser
 
 
 def _add_mixed_precision_args(parser):
     group = parser.add_argument_group(title='mixed precision')
 
-    group.add_argument('--grad-reduce-in-bf16', action='store_true',
-                       help='Reduce gradients in bfloat16.')
-    group.add_argument('--loss-scale', type=float, default=None,
-                       help='Static loss scaling, positive power of 2 '
-                       'values can improve fp16 convergence. If None, dynamic'
-                       'loss scaling is used.')
-    group.add_argument('--initial-loss-scale', type=float, default=2**32,
-                       help='Initial loss-scale for dynamic loss scaling.')
-    group.add_argument('--min-loss-scale', type=float, default=1.0,
-                       help='Minimum loss scale for dynamic loss scaling.')
-    group.add_argument('--loss-scale-window', type=float, default=1000,
-                       help='Window over which to raise/lower dynamic scale.')
-    group.add_argument('--hysteresis', type=int, default=2,
-                       help='hysteresis for dynamic loss scaling')
-    group.add_argument('--attention-softmax-in-fp32', action='store_true',
-                       help='Run attention masking and softmax in fp32.')
-    group.add_argument('--accumulate-allreduce-grads-in-fp32',
-                       action='store_true',
-                       help='Gradient accumulation and all-reduce in fp32.')
-    group.add_argument('--fp16-lm-cross-entropy', action='store_true',
-                       help='Move the cross entropy unreduced loss calculation'
-                       'for lm head to fp16.')
-    group.add_argument('--reuse-grad-buf-for-mxfp8-param-ag', action='store_true',
-                       help='If True, reuse the grad buffer for MXFP8 parameter all-gather.')
+    group.add_argument(
+        '--grad-reduce-in-bf16', action='store_true', help='Reduce gradients in bfloat16.'
+    )
+    group.add_argument(
+        '--loss-scale',
+        type=float,
+        default=None,
+        help='Static loss scaling, positive power of 2 '
+        'values can improve fp16 convergence. If None, dynamic'
+        'loss scaling is used.',
+    )
+    group.add_argument(
+        '--initial-loss-scale',
+        type=float,
+        default=2**32,
+        help='Initial loss-scale for dynamic loss scaling.',
+    )
+    group.add_argument(
+        '--min-loss-scale',
+        type=float,
+        default=1.0,
+        help='Minimum loss scale for dynamic loss scaling.',
+    )
+    group.add_argument(
+        '--loss-scale-window',
+        type=float,
+        default=1000,
+        help='Window over which to raise/lower dynamic scale.',
+    )
+    group.add_argument(
+        '--hysteresis', type=int, default=2, help='hysteresis for dynamic loss scaling'
+    )
+    group.add_argument(
+        '--attention-softmax-in-fp32',
+        action='store_true',
+        help='Run attention masking and softmax in fp32.',
+    )
+    group.add_argument(
+        '--accumulate-allreduce-grads-in-fp32',
+        action='store_true',
+        help='Gradient accumulation and all-reduce in fp32.',
+    )
+    group.add_argument(
+        '--fp16-lm-cross-entropy',
+        action='store_true',
+        help='Move the cross entropy unreduced loss calculation' 'for lm head to fp16.',
+    )
+    group.add_argument(
+        '--reuse-grad-buf-for-mxfp8-param-ag',
+        action='store_true',
+        help='If True, reuse the grad buffer for MXFP8 parameter all-gather.',
+    )
 
     return parser
 
@@ -2725,117 +3677,246 @@ def _add_distributed_args(parser):
     dist_init_factory = ArgumentGroupFactory(DistributedInitConfig)
     group = dist_init_factory.build_group(parser, "distributed init")
 
-    group.add_argument('--decoder-first-pipeline-num-layers',
-                       type=int, default=None,
-                       help=('The number of transformer layers on the first pipeline stage of the decoder. '
-                       'Default None is even split of transformer layers across all pipeline stages'))
-    group.add_argument('--decoder-last-pipeline-num-layers',
-                       type=int, default=None,
-                       help=('The number of transformer layers on the last pipeline stage of the decoder. '
-                       'Default None is even split of transformer layers across all pipeline stages'))
-    group.add_argument('--pipeline-model-parallel-layout',
-                       type=str, default=None,
-                       help=('A string that describes a custom pipeline model parallel layout. '
-                       'e.g., "E|(t|)*3,m|m||L". E, L, t, m denotes embedding, loss, transformer '
-                       'decoder layer, and mtp layer, respectively. Stages are split by "|". '
-                       'Replicated stages or layers can be described with multiplication. '
-                       'Commas can be used cosmetically. '
-                       'Default None is not using this argument to set the layout.'))
-    group.add_argument('--model-parallel-size', type=int, default=None,
-                       help='Old model parallel argument, do not use. Use '
-                       '--tensor-model-parallel-size instead.')
-    group.add_argument('--num-layers-per-virtual-pipeline-stage', type=int, default=None,
-                       help='Number of layers per virtual pipeline stage')
-    group.add_argument('--num-virtual-stages-per-pipeline-rank', type=int, default=None,
-                       help='Number of virtual pipeline stages per pipeline parallelism rank')
-    group.add_argument('--no-overlap-p2p-communication', action='store_false',
-                       help='overlap pipeline parallel communication with forward and backward chunks in 1F1B',
-                       dest='overlap_p2p_comm')
-    group.add_argument('--overlap-grad-reduce', action='store_true',
-                       default=False, help='If set, overlap DDP grad reduce.')
-    group.add_argument('--ddp-num-buckets', type=int, default=None,
-                       help='Number of buckets for data-parallel communication')
-    group.add_argument('--ddp-bucket-size', type=int, default=None,
-                       help='Bucket size for data-parallel communication')
-    group.add_argument('--ddp-pad-buckets-for-high-nccl-busbw', action='store_true',
-                       default=False, help='If set, make sure the bucket size is divisible by a large power '
-                       'of 2 (2^16) to ensure NCCL collectives have high bus bandwidth at large DP counts, '
-                       'since NCCL message size (which for ring algorithms is bucket_size / dp_size) '
-                       'apparently needs to be divisible by a power of 2 for high busbw.')
-    group.add_argument('--ddp-reduce-scatter-with-fp32-accumulation', action='store_true',
-                       default=False, help='If set, use a reduce-scatter implementation which sends lower-precision '
-                       'values over the wire (using an all-to-all to keep total communication overhead in line '
-                       'with the standard ring implementation) but performs accumulation locally in FP32.')
-    group.add_argument('--ddp-param-name-patterns-for-fp32-local-accumulation', nargs='+', default=[],
-                       help='List of param_name patterns (in Python\'s fnmatch format) to match against '
-                       'to do local gradient accumulation in FP32. The special pattern \'all\' matches '
-                       'every parameter.')
-    group.add_argument('--ddp-average-in-collective', action='store_true',
-                       default=False, help='If set, average directly in data-parallel communication collective.')
-    group.add_argument('--overlap-param-gather', action='store_true',
-                       default=False, help='If set, overlap param all-gather in distributed optimizer.')
-    group.add_argument('--overlap-param-gather-with-optimizer-step', action='store_true',
-                       default=False, help='If set, overlap param all-gather of first bucket with optimizer step.')
-    group.add_argument('--no-align-param-gather', action='store_false',
-                       help='If not set, all PP stages will launch param all-gathers simultaneously. '
-                       'Otherwise, each PP stage will independently launch as needed.',
-                       dest='align_param_gather')
-    group.add_argument('--use-distributed-optimizer', action='store_true',
-                       help='Use distributed optimizer.')
-    group.add_argument('--use-nccl-ub', action='store_true', dest='nccl_ub',
-                       help='Use the userbuffer registration for DP/FSDP communication buffers.'
-                       'This option will reduce GPU SM usage for the DP/FSDP communication,'
-                       'which is improving the performance of the overlapped computation.')
-    group.add_argument('--disable-symmetric-registration', action='store_true', dest='disable_symmetric_registration',
-                       default=False, help='Disable symmetric (window) registration for NCCL userbuffer registration.'
-                       'This option will force to use conventional (local) userbuffer registration when use-nccl-ub is set.')
-    group.add_argument('--fsdp-manual-registration', action='store_true', dest='fsdp_manual_registration',
-                       default=False, help='Manually register the FSDP communication buffers to NCCL user buffer.'
-                       'This option is only effective when use-megatron-fsdp and use-nccl-ub is set.')
-    group.add_argument('--create-all-gather-group', action='store_true',
-                       help='Create a separate process group for all-gather operations '
-                       'to overlap reduce-scatter and all-gather operations.')
-    group.add_argument('--data-parallel-sharding-strategy', type=str, default='optim_grads_params',
-                       choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params'],
-                       help='Sharding strategy of data parallelism.')
-    group.add_argument('--outer-dp-sharding-strategy', type=str, default='no_shard',
-                       choices=['no_shard', 'optim'],
-                       help='Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode. '
-                            'Valid values are "no_shard" (DP Replication) and "optim" (Optimizer State Hybrid Sharding). '
-                            'The "optim" option is only supported when --data-parallel-sharding-strategy is "optim_grads_params". '
-                            'This option is only effective when Hybrid FSDP is enabled (i.e., when dp_outer_dim is not None). '
-                            'Default: "no_shard".')
-    group.add_argument('--no-gradient-reduce-div-fusion', action='store_false', dest='gradient_reduce_div_fusion',
-                       help='If not set, fuse the division in gradient reduce.')
-    group.add_argument('--fsdp-double-buffer', action='store_true',
-                       help="Enable double buffering for temporary memory needed for Megatron FSDP communications. "
-                        "Double-buffering the communication memory improves memory management efficiency by "
-                        "reusing previously allocated buffers, rather than creating new buffers for each FSDP communication. "
-                        "This is required for user buffer registration and is enabled by default when using NCCL user buffers.")
-    group.add_argument('--suggested-communication-unit-size', type=int, default=None,
-                   help='Specifies the number of elements to communicate at once during FSDP (Fully Sharded Data Parallel) operations. '
-                        'This flag also affects FSDP all-gather prefetch behavior. Setting a larger value increases the communication buffer size, '
-                        'while a smaller value disables prefetching and may degrade performance. Adjust this value based on your system\'s memory '
-                        'and performance requirements.')
-    group.add_argument('--keep-fp8-transpose-cache', action='store_true',
-                       help='If set, keep the fp8 transpose cache when using Megatron FSDP.')
-    group.add_argument('--enable-full-sharding-in-hsdp', action='store_true',
-                       help='If set, enable full sharding in megatron-fsdp Hybrid Sharded Data Parallel (HSDP) mode.')
-    group.add_argument('--num-distributed-optimizer-instances', type=int, default=1,
-                       help='Number of Distributed Optimizer copies across Data Parallel domain.')
-    group.add_argument('--torch-fsdp2-no-reshard-after-forward', action='store_false', dest='torch_fsdp2_reshard_after_forward',
-                       help='Whether to reshard weights after forward pass when using PyTorch FSDP2. '
-                       'Set to enable FSDP ZeRO-2.')
-    group.add_argument('--cp-comm-type', nargs='+', type=str, default=["p2p"],
-                       help='Inter-gpu communication type for context parallelism: '
-                       'p2p, a2a, allgather or a2a+p2p. If a single string is provided, '
-                       'all layers will share the same communication type. Users can also '
-                       'specify separated types for each layer like '
-                       '--cp-comm-type p2p p2p a2a a2a a2a+p2p a2a+p2p')
-    group.add_argument('--fake-process-group', action='store_true', default=False,
-                       help='If set, initialize with fake distributed process group and all distributed communication operations will be skipped. \
+    group.add_argument(
+        '--decoder-first-pipeline-num-layers',
+        type=int,
+        default=None,
+        help=(
+            'The number of transformer layers on the first pipeline stage of the decoder. '
+            'Default None is even split of transformer layers across all pipeline stages'
+        ),
+    )
+    group.add_argument(
+        '--decoder-last-pipeline-num-layers',
+        type=int,
+        default=None,
+        help=(
+            'The number of transformer layers on the last pipeline stage of the decoder. '
+            'Default None is even split of transformer layers across all pipeline stages'
+        ),
+    )
+    group.add_argument(
+        '--pipeline-model-parallel-layout',
+        type=str,
+        default=None,
+        help=(
+            'A string that describes a custom pipeline model parallel layout. '
+            'e.g., "E|(t|)*3,m|m||L". E, L, t, m denotes embedding, loss, transformer '
+            'decoder layer, and mtp layer, respectively. Stages are split by "|". '
+            'Replicated stages or layers can be described with multiplication. '
+            'Commas can be used cosmetically. '
+            'Default None is not using this argument to set the layout.'
+        ),
+    )
+    group.add_argument(
+        '--model-parallel-size',
+        type=int,
+        default=None,
+        help='Old model parallel argument, do not use. Use '
+        '--tensor-model-parallel-size instead.',
+    )
+    group.add_argument(
+        '--num-layers-per-virtual-pipeline-stage',
+        type=int,
+        default=None,
+        help='Number of layers per virtual pipeline stage',
+    )
+    group.add_argument(
+        '--num-virtual-stages-per-pipeline-rank',
+        type=int,
+        default=None,
+        help='Number of virtual pipeline stages per pipeline parallelism rank',
+    )
+    group.add_argument(
+        '--no-overlap-p2p-communication',
+        action='store_false',
+        help='overlap pipeline parallel communication with forward and backward chunks in 1F1B',
+        dest='overlap_p2p_comm',
+    )
+    group.add_argument(
+        '--overlap-grad-reduce',
+        action='store_true',
+        default=False,
+        help='If set, overlap DDP grad reduce.',
+    )
+    group.add_argument(
+        '--ddp-num-buckets',
+        type=int,
+        default=None,
+        help='Number of buckets for data-parallel communication',
+    )
+    group.add_argument(
+        '--ddp-bucket-size',
+        type=int,
+        default=None,
+        help='Bucket size for data-parallel communication',
+    )
+    group.add_argument(
+        '--ddp-pad-buckets-for-high-nccl-busbw',
+        action='store_true',
+        default=False,
+        help='If set, make sure the bucket size is divisible by a large power '
+        'of 2 (2^16) to ensure NCCL collectives have high bus bandwidth at large DP counts, '
+        'since NCCL message size (which for ring algorithms is bucket_size / dp_size) '
+        'apparently needs to be divisible by a power of 2 for high busbw.',
+    )
+    group.add_argument(
+        '--ddp-reduce-scatter-with-fp32-accumulation',
+        action='store_true',
+        default=False,
+        help='If set, use a reduce-scatter implementation which sends lower-precision '
+        'values over the wire (using an all-to-all to keep total communication overhead in line '
+        'with the standard ring implementation) but performs accumulation locally in FP32.',
+    )
+    group.add_argument(
+        '--ddp-param-name-patterns-for-fp32-local-accumulation',
+        nargs='+',
+        default=[],
+        help='List of param_name patterns (in Python\'s fnmatch format) to match against '
+        'to do local gradient accumulation in FP32. The special pattern \'all\' matches '
+        'every parameter.',
+    )
+    group.add_argument(
+        '--ddp-average-in-collective',
+        action='store_true',
+        default=False,
+        help='If set, average directly in data-parallel communication collective.',
+    )
+    group.add_argument(
+        '--overlap-param-gather',
+        action='store_true',
+        default=False,
+        help='If set, overlap param all-gather in distributed optimizer.',
+    )
+    group.add_argument(
+        '--overlap-param-gather-with-optimizer-step',
+        action='store_true',
+        default=False,
+        help='If set, overlap param all-gather of first bucket with optimizer step.',
+    )
+    group.add_argument(
+        '--no-align-param-gather',
+        action='store_false',
+        help='If not set, all PP stages will launch param all-gathers simultaneously. '
+        'Otherwise, each PP stage will independently launch as needed.',
+        dest='align_param_gather',
+    )
+    group.add_argument(
+        '--use-distributed-optimizer', action='store_true', help='Use distributed optimizer.'
+    )
+    group.add_argument(
+        '--use-nccl-ub',
+        action='store_true',
+        dest='nccl_ub',
+        help='Use the userbuffer registration for DP/FSDP communication buffers.'
+        'This option will reduce GPU SM usage for the DP/FSDP communication,'
+        'which is improving the performance of the overlapped computation.',
+    )
+    group.add_argument(
+        '--disable-symmetric-registration',
+        action='store_true',
+        dest='disable_symmetric_registration',
+        default=False,
+        help='Disable symmetric (window) registration for NCCL userbuffer registration.'
+        'This option will force to use conventional (local) userbuffer registration when use-nccl-ub is set.',
+    )
+    group.add_argument(
+        '--fsdp-manual-registration',
+        action='store_true',
+        dest='fsdp_manual_registration',
+        default=False,
+        help='Manually register the FSDP communication buffers to NCCL user buffer.'
+        'This option is only effective when use-megatron-fsdp and use-nccl-ub is set.',
+    )
+    group.add_argument(
+        '--create-all-gather-group',
+        action='store_true',
+        help='Create a separate process group for all-gather operations '
+        'to overlap reduce-scatter and all-gather operations.',
+    )
+    group.add_argument(
+        '--data-parallel-sharding-strategy',
+        type=str,
+        default='optim_grads_params',
+        choices=['no_shard', 'optim', 'optim_grads', 'optim_grads_params'],
+        help='Sharding strategy of data parallelism.',
+    )
+    group.add_argument(
+        '--outer-dp-sharding-strategy',
+        type=str,
+        default='no_shard',
+        choices=['no_shard', 'optim'],
+        help='Sharding strategy for outer data parallel group in Hybrid Sharded Data Parallel (HSDP) mode. '
+        'Valid values are "no_shard" (DP Replication) and "optim" (Optimizer State Hybrid Sharding). '
+        'The "optim" option is only supported when --data-parallel-sharding-strategy is "optim_grads_params". '
+        'This option is only effective when Hybrid FSDP is enabled (i.e., when dp_outer_dim is not None). '
+        'Default: "no_shard".',
+    )
+    group.add_argument(
+        '--no-gradient-reduce-div-fusion',
+        action='store_false',
+        dest='gradient_reduce_div_fusion',
+        help='If not set, fuse the division in gradient reduce.',
+    )
+    group.add_argument(
+        '--fsdp-double-buffer',
+        action='store_true',
+        help="Enable double buffering for temporary memory needed for Megatron FSDP communications. "
+        "Double-buffering the communication memory improves memory management efficiency by "
+        "reusing previously allocated buffers, rather than creating new buffers for each FSDP communication. "
+        "This is required for user buffer registration and is enabled by default when using NCCL user buffers.",
+    )
+    group.add_argument(
+        '--suggested-communication-unit-size',
+        type=int,
+        default=None,
+        help='Specifies the number of elements to communicate at once during FSDP (Fully Sharded Data Parallel) operations. '
+        'This flag also affects FSDP all-gather prefetch behavior. Setting a larger value increases the communication buffer size, '
+        'while a smaller value disables prefetching and may degrade performance. Adjust this value based on your system\'s memory '
+        'and performance requirements.',
+    )
+    group.add_argument(
+        '--keep-fp8-transpose-cache',
+        action='store_true',
+        help='If set, keep the fp8 transpose cache when using Megatron FSDP.',
+    )
+    group.add_argument(
+        '--enable-full-sharding-in-hsdp',
+        action='store_true',
+        help='If set, enable full sharding in megatron-fsdp Hybrid Sharded Data Parallel (HSDP) mode.',
+    )
+    group.add_argument(
+        '--num-distributed-optimizer-instances',
+        type=int,
+        default=1,
+        help='Number of Distributed Optimizer copies across Data Parallel domain.',
+    )
+    group.add_argument(
+        '--torch-fsdp2-no-reshard-after-forward',
+        action='store_false',
+        dest='torch_fsdp2_reshard_after_forward',
+        help='Whether to reshard weights after forward pass when using PyTorch FSDP2. '
+        'Set to enable FSDP ZeRO-2.',
+    )
+    group.add_argument(
+        '--cp-comm-type',
+        nargs='+',
+        type=str,
+        default=["p2p"],
+        help='Inter-gpu communication type for context parallelism: '
+        'p2p, a2a, allgather or a2a+p2p. If a single string is provided, '
+        'all layers will share the same communication type. Users can also '
+        'specify separated types for each layer like '
+        '--cp-comm-type p2p p2p a2a a2a a2a+p2p a2a+p2p',
+    )
+    group.add_argument(
+        '--fake-process-group',
+        action='store_true',
+        default=False,
+        help='If set, initialize with fake distributed process group and all distributed communication operations will be skipped. \
                        This is quite useful for profiling memory usage of distributed training with just one GPU. \
-                       Setting WORLD_SIZE and RANK to the specific values for target distribtued scale.')
+                       Setting WORLD_SIZE and RANK to the specific values for target distribtued scale.',
+    )
     return parser
 
 
@@ -2850,194 +3931,366 @@ def _add_validation_args(parser):
 
 def _add_tokenizer_args(parser):
     group = parser.add_argument_group(title='tokenizer')
-    group.add_argument('--vocab-size', type=int, default=None,
-                       help='Size of vocab before EOD or padding.')
-    group.add_argument('--padded-vocab-size', type=int, default=None,
-                       help='Vocabulary size of the model (padded to be divisible by '
-                       'tensor model parallel size). If not provided, it will be '
-                       'automatically calculated from vocab-size.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file.')
-    group.add_argument('--merge-file', type=str, default=None,
-                       help='Path to the BPE merge file.')
-    group.add_argument('--vocab-extra-ids', type=int, default=0,
-                       help='Number of additional vocabulary tokens. '
-                            'They are used for span masking in the T5 model')
-    group.add_argument('--tokenizer-type', type=str,
-                       default=None,
-                       choices=['BertWordPieceLowerCase',
-                                'BertWordPieceCase',
-                                'GPT2BPETokenizer',
-                                'SentencePieceTokenizer',
-                                'GPTSentencePieceTokenizer',
-                                'HuggingFaceTokenizer',
-                                'Llama2Tokenizer',
-                                'TikTokenizer',
-                                'MultimodalTokenizer',
-                                'NullTokenizer',
-                                'NullMultimodalTokenizer',
-                                'SFTTokenizer'],
-                       help='What type of tokenizer to use.')
-    group.add_argument('--tokenizer-model', type=str, default=None,
-                       help='Sentencepiece tokenizer model.')
-    group.add_argument('--tokenizer-metadata', type=str, default=None,
-                       help='Path to tokenizer metadata in json format.')
-    group.add_argument('--tokenizer-special-tokens', type=str, nargs='+', default=None,
-                       help='List of special tokens. For TikTokenizer needs to have '
-                            '["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]')
-    group.add_argument('--tiktoken-pattern', type=str, default=None,
-                       help='Which tiktoken pattern to use. Options: [v1, v2]')
-    group.add_argument('--tiktoken-num-special-tokens', type=int, default=1000,
-                       help='Number of special tokens in tiktoken tokenizer')
-    group.add_argument('--tiktoken-special-tokens', type=str, nargs='+', default=None,
-                       help='List of tiktoken special tokens, needs to have '
-                            '["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]')
-    group.add_argument('--tokenizer-sentencepiece-legacy', action='store_true', default=False,
-                       help='SentencePiece tokenizer wrapper legacy behavior. Allows special tokens usage.')
-    group.add_argument('--tokenizer-hf-use-fast', action='store_true', default=True,
-                       help='Whether to use fast HuggingFace tokenizer.')
-    group.add_argument('--tokenizer-hf-include-special-tokens', action='store_true', default=True,
-                       help='Converting text to ids will include special for HuggingFace tokenizer.')
-    group.add_argument('--tokenizer-hf-no-use-fast', action='store_true', default=False,
-                       help='Whether to use fast HuggingFace tokenizer.')
-    group.add_argument('--tokenizer-hf-no-include-special-tokens', action='store_true', default=False,
-                       help='Converting text to ids will not include special for HuggingFace tokenizer.')
-    group.add_argument("--trust-remote-code", action="store_true", default=False,
-                       help='Whether or not to allow PreTrainedTokenizer to execute remote code')
+    group.add_argument(
+        '--vocab-size', type=int, default=None, help='Size of vocab before EOD or padding.'
+    )
+    group.add_argument(
+        '--padded-vocab-size',
+        type=int,
+        default=None,
+        help='Vocabulary size of the model (padded to be divisible by '
+        'tensor model parallel size). If not provided, it will be '
+        'automatically calculated from vocab-size.',
+    )
+    group.add_argument('--vocab-file', type=str, default=None, help='Path to the vocab file.')
+    group.add_argument('--merge-file', type=str, default=None, help='Path to the BPE merge file.')
+    group.add_argument(
+        '--vocab-extra-ids',
+        type=int,
+        default=0,
+        help='Number of additional vocabulary tokens. '
+        'They are used for span masking in the T5 model',
+    )
+    group.add_argument(
+        '--tokenizer-type',
+        type=str,
+        default=None,
+        choices=[
+            'BertWordPieceLowerCase',
+            'BertWordPieceCase',
+            'GPT2BPETokenizer',
+            'SentencePieceTokenizer',
+            'GPTSentencePieceTokenizer',
+            'HuggingFaceTokenizer',
+            'Llama2Tokenizer',
+            'TikTokenizer',
+            'MultimodalTokenizer',
+            'NullTokenizer',
+            'NullMultimodalTokenizer',
+            'SFTTokenizer',
+        ],
+        help='What type of tokenizer to use.',
+    )
+    group.add_argument(
+        '--tokenizer-model', type=str, default=None, help='Sentencepiece tokenizer model.'
+    )
+    group.add_argument(
+        '--tokenizer-metadata',
+        type=str,
+        default=None,
+        help='Path to tokenizer metadata in json format.',
+    )
+    group.add_argument(
+        '--tokenizer-special-tokens',
+        type=str,
+        nargs='+',
+        default=None,
+        help='List of special tokens. For TikTokenizer needs to have '
+        '["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]',
+    )
+    group.add_argument(
+        '--tiktoken-pattern',
+        type=str,
+        default=None,
+        help='Which tiktoken pattern to use. Options: [v1, v2]',
+    )
+    group.add_argument(
+        '--tiktoken-num-special-tokens',
+        type=int,
+        default=1000,
+        help='Number of special tokens in tiktoken tokenizer',
+    )
+    group.add_argument(
+        '--tiktoken-special-tokens',
+        type=str,
+        nargs='+',
+        default=None,
+        help='List of tiktoken special tokens, needs to have '
+        '["<unk>", "<s>", "</s>", "<mask>", "<pad>", "<cls>", "<sep>"]',
+    )
+    group.add_argument(
+        '--tokenizer-sentencepiece-legacy',
+        action='store_true',
+        default=False,
+        help='SentencePiece tokenizer wrapper legacy behavior. Allows special tokens usage.',
+    )
+    group.add_argument(
+        '--tokenizer-hf-use-fast',
+        action='store_true',
+        default=True,
+        help='Whether to use fast HuggingFace tokenizer.',
+    )
+    group.add_argument(
+        '--tokenizer-hf-include-special-tokens',
+        action='store_true',
+        default=True,
+        help='Converting text to ids will include special for HuggingFace tokenizer.',
+    )
+    group.add_argument(
+        '--tokenizer-hf-no-use-fast',
+        action='store_true',
+        default=False,
+        help='Whether to use fast HuggingFace tokenizer.',
+    )
+    group.add_argument(
+        '--tokenizer-hf-no-include-special-tokens',
+        action='store_true',
+        default=False,
+        help='Converting text to ids will not include special for HuggingFace tokenizer.',
+    )
+    group.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        default=False,
+        help='Whether or not to allow PreTrainedTokenizer to execute remote code',
+    )
     return parser
 
 
 def _add_data_args(parser):
     group = parser.add_argument_group(title='data and dataloader')
 
-    group.add_argument('--data-path', nargs='*', default=None,
-                       help='The weight and prefix list for a set of train, validation, and test'
-                       'datasets which split according to --split. The accepted formats are: '
-                       '(1) a single prefix, '
-                       '(2) a list of weight prefix pairs e.g. weight1 prefix1 weight2 prefix2, '
-                       '(3) a list of prefixes e.g. prefix1 prefix2. '
-                       'For (3), weights are inferred from the lengths of the contributing datasets. '
-                       'This argument is exclusive to the other independent --*-data-path arguments.')
-    group.add_argument('--phase-transition-iterations', type=str, default=None,
-                       help='Comma-separated list of iterations where phase '
-                       'transitions occur. Requires fixed global batch size across phases.')
-    group.add_argument('--split', type=str, default=None,
-                       help='Comma-separated list of proportions for training,'
-                       ' validation, and test split. For example the split '
-                       '`90,5,5` will use 90%% of data for training, 5%% for '
-                       'validation and 5%% for test.')
-    group.add_argument('--train-data-path', nargs='*', default=None,
-                       help='The weight and prefix list for an independent train dataset. '
-                       'Follows the same pattern rules as --data-path.')
-    group.add_argument('--valid-data-path', nargs='*', default=None,
-                       help='The weight and prefix list for an independent validation dataset. '
-                       'Follows the same pattern rules as --data-path.')
-    group.add_argument('--test-data-path', nargs='*', default=None,
-                       help='The weight and prefix list for an independent test dataset. '
-                       'Follows the same pattern rules as --data-path.')
-    group.add_argument('--data-args-path', type=str, default=None,
-                       help='Path to data-args. Instead of feeding `--data-path` '
-                       'with weighted dataset, we pass in a file path from which '
-                       'we read that argument. This is useful when the list of data is '
-                       'too big.')
-    group.add_argument('--per-split-data-args-path', type=str, default=None,
-                       help='Path to per-split-data-args. Instead of feeding '
-                       '`--(train|valid|test)-data-path` with weighted dataset, '
-                       'we pass in a file path from which we read those arguments. '
-                       'This is useful when the list of data is too big. Format is a '
-                       'json file with `train`, `valid, `test` keys')
-    group.add_argument('--per-dataset-sequences-path', default=None,
-                       help='Path to a json file with the sequences per dataset. Check the tools/build_sequences_per_dataset.py script to build this file.')
-    group.add_argument('--dataloader-fast-cache-load', action='store_true',
-                       help='Option to use the fast cache loading path when building the datasets. Requires all the dataset caches to be built and stored in --data-cache-path.')
-    group.add_argument('--dataloader-defer-npy-index-mmap', action='store_true',
-                       help='Defer the mmap of the dataset indexes (.npy files) until the first access. Requires all the dataset caches to be built and stored in --data-cache-path.')
-    group.add_argument('--data-cache-path', default=None,
-                       help='Path to a directory to hold cached index files.')
-    group.add_argument('--no-mmap-bin-files', action='store_false',
-                       help='Disable mmap-ing of .bin files.',
-                       dest='mmap_bin_files')
-    group.add_argument('--mock-data', action='store_true',
-                       help='Skip data loading and validation and opt for artificial '
-                       'generation of mock data when an implementation is available.')
-    group.add_argument('--seq-length', type=int, default=None,
-                       help='Maximum sequence length to process.')
-    group.add_argument('--encoder-seq-length', type=int, default=None,
-                       help='Maximum encoder sequence length to process.'
-                       'This should be exclusive of --seq-length')
-    group.add_argument('--decoder-seq-length', type=int, default=None,
-                       help="Maximum decoder sequence length to process.")
-    group.add_argument('--sample-rate', type=float, default=1.0,
-                       help='sample rate for training data. Supposed to be 0 '
-                            ' < sample_rate < 1')
-    group.add_argument('--mask-prob', type=float, default=0.15,
-                       help='Probability of replacing a token with mask.')
-    group.add_argument('--short-seq-prob', type=float, default=0.1,
-                       help='Probability of producing a short sequence.')
-    group.add_argument('--num-workers', type=int, default=2,
-                       help="Dataloader number of workers.")
-    group.add_argument('--reset-position-ids', action='store_true',
-                       help='Reset posistion ids after end-of-document token.')
-    group.add_argument('--reset-attention-mask', action='store_true',
-                       help='Reset self attention mask after '
-                       'end-of-document token.')
-    group.add_argument('--eod-mask-loss', action='store_true',
-                       help='Mask loss for the end of document tokens.')
-    group.add_argument('--no-create-attention-mask-in-dataloader', action='store_false',
-                       help='If set, do not create attention_masks in dataloader.',
-                       dest='create_attention_mask_in_dataloader')
-    group.add_argument('--num-dataset-builder-threads', type=int, default=1,
-                       help='Number of parallel threads per rank for dataset builder')
-    group.add_argument('--object-storage-cache-path', type=str, default=None,
-                       help='Path to cache index files when using s3 or msc dataloader')
-    group.add_argument('--mid-level-dataset-surplus', type=float, default=0.005,
-                       help='The sample surplus to build for the mid-level datasets(s)')
-    group.add_argument('--allow-ambiguous-pad-tokens', action='store_true',
-                       help='Whether to prevent pad tokens already present in the dataset '
-                       'from being masked out when the pad token incorrectly shares the same id '
-                       'with other special tokens in the tokenizer. Note that this argument has '
-                       'no effect when the tokenizer correctly provides a unique id for the pad. '
-                       'Masking out such ambiguous pad tokens results in training instability. '
-                       'Such a scenario is best resolved by fixing the tokenizer; leaving this '
-                       'option as False provides a workaround. '
-                       'When left to the default of False, any token ids that collide with the '
-                       'pad token id - as provided by the tokenizer - will not be masked out of '
-                       'the loss calculation: it cannot be determined whether they are truly pad. '
-                       'If instead this argument is set, the training flow will treat all tokens '
-                       'that share the same id as the pad token as true pad tokens, potentially '
-                       'causing severe training instability.')
+    group.add_argument(
+        '--data-path',
+        nargs='*',
+        default=None,
+        help='The weight and prefix list for a set of train, validation, and test'
+        'datasets which split according to --split. The accepted formats are: '
+        '(1) a single prefix, '
+        '(2) a list of weight prefix pairs e.g. weight1 prefix1 weight2 prefix2, '
+        '(3) a list of prefixes e.g. prefix1 prefix2. '
+        'For (3), weights are inferred from the lengths of the contributing datasets. '
+        'This argument is exclusive to the other independent --*-data-path arguments.',
+    )
+    group.add_argument(
+        '--phase-transition-iterations',
+        type=str,
+        default=None,
+        help='Comma-separated list of iterations where phase '
+        'transitions occur. Requires fixed global batch size across phases.',
+    )
+    group.add_argument(
+        '--split',
+        type=str,
+        default=None,
+        help='Comma-separated list of proportions for training,'
+        ' validation, and test split. For example the split '
+        '`90,5,5` will use 90%% of data for training, 5%% for '
+        'validation and 5%% for test.',
+    )
+    group.add_argument(
+        '--train-data-path',
+        nargs='*',
+        default=None,
+        help='The weight and prefix list for an independent train dataset. '
+        'Follows the same pattern rules as --data-path.',
+    )
+    group.add_argument(
+        '--valid-data-path',
+        nargs='*',
+        default=None,
+        help='The weight and prefix list for an independent validation dataset. '
+        'Follows the same pattern rules as --data-path.',
+    )
+    group.add_argument(
+        '--test-data-path',
+        nargs='*',
+        default=None,
+        help='The weight and prefix list for an independent test dataset. '
+        'Follows the same pattern rules as --data-path.',
+    )
+    group.add_argument(
+        '--data-args-path',
+        type=str,
+        default=None,
+        help='Path to data-args. Instead of feeding `--data-path` '
+        'with weighted dataset, we pass in a file path from which '
+        'we read that argument. This is useful when the list of data is '
+        'too big.',
+    )
+    group.add_argument(
+        '--per-split-data-args-path',
+        type=str,
+        default=None,
+        help='Path to per-split-data-args. Instead of feeding '
+        '`--(train|valid|test)-data-path` with weighted dataset, '
+        'we pass in a file path from which we read those arguments. '
+        'This is useful when the list of data is too big. Format is a '
+        'json file with `train`, `valid, `test` keys',
+    )
+    group.add_argument(
+        '--per-dataset-sequences-path',
+        default=None,
+        help='Path to a json file with the sequences per dataset. Check the tools/build_sequences_per_dataset.py script to build this file.',
+    )
+    group.add_argument(
+        '--dataloader-fast-cache-load',
+        action='store_true',
+        help='Option to use the fast cache loading path when building the datasets. Requires all the dataset caches to be built and stored in --data-cache-path.',
+    )
+    group.add_argument(
+        '--dataloader-defer-npy-index-mmap',
+        action='store_true',
+        help='Defer the mmap of the dataset indexes (.npy files) until the first access. Requires all the dataset caches to be built and stored in --data-cache-path.',
+    )
+    group.add_argument(
+        '--data-cache-path', default=None, help='Path to a directory to hold cached index files.'
+    )
+    group.add_argument(
+        '--no-mmap-bin-files',
+        action='store_false',
+        help='Disable mmap-ing of .bin files.',
+        dest='mmap_bin_files',
+    )
+    group.add_argument(
+        '--mock-data',
+        action='store_true',
+        help='Skip data loading and validation and opt for artificial '
+        'generation of mock data when an implementation is available.',
+    )
+    group.add_argument(
+        '--seq-length', type=int, default=None, help='Maximum sequence length to process.'
+    )
+    group.add_argument(
+        '--encoder-seq-length',
+        type=int,
+        default=None,
+        help='Maximum encoder sequence length to process.'
+        'This should be exclusive of --seq-length',
+    )
+    group.add_argument(
+        '--decoder-seq-length',
+        type=int,
+        default=None,
+        help="Maximum decoder sequence length to process.",
+    )
+    group.add_argument(
+        '--sample-rate',
+        type=float,
+        default=1.0,
+        help='sample rate for training data. Supposed to be 0 ' ' < sample_rate < 1',
+    )
+    group.add_argument(
+        '--mask-prob', type=float, default=0.15, help='Probability of replacing a token with mask.'
+    )
+    group.add_argument(
+        '--short-seq-prob',
+        type=float,
+        default=0.1,
+        help='Probability of producing a short sequence.',
+    )
+    group.add_argument('--num-workers', type=int, default=2, help="Dataloader number of workers.")
+    group.add_argument(
+        '--reset-position-ids',
+        action='store_true',
+        help='Reset posistion ids after end-of-document token.',
+    )
+    group.add_argument(
+        '--reset-attention-mask',
+        action='store_true',
+        help='Reset self attention mask after ' 'end-of-document token.',
+    )
+    group.add_argument(
+        '--eod-mask-loss', action='store_true', help='Mask loss for the end of document tokens.'
+    )
+    group.add_argument(
+        '--no-create-attention-mask-in-dataloader',
+        action='store_false',
+        help='If set, do not create attention_masks in dataloader.',
+        dest='create_attention_mask_in_dataloader',
+    )
+    group.add_argument(
+        '--num-dataset-builder-threads',
+        type=int,
+        default=1,
+        help='Number of parallel threads per rank for dataset builder',
+    )
+    group.add_argument(
+        '--object-storage-cache-path',
+        type=str,
+        default=None,
+        help='Path to cache index files when using s3 or msc dataloader',
+    )
+    group.add_argument(
+        '--mid-level-dataset-surplus',
+        type=float,
+        default=0.005,
+        help='The sample surplus to build for the mid-level datasets(s)',
+    )
+    group.add_argument(
+        '--allow-ambiguous-pad-tokens',
+        action='store_true',
+        help='Whether to prevent pad tokens already present in the dataset '
+        'from being masked out when the pad token incorrectly shares the same id '
+        'with other special tokens in the tokenizer. Note that this argument has '
+        'no effect when the tokenizer correctly provides a unique id for the pad. '
+        'Masking out such ambiguous pad tokens results in training instability. '
+        'Such a scenario is best resolved by fixing the tokenizer; leaving this '
+        'option as False provides a workaround. '
+        'When left to the default of False, any token ids that collide with the '
+        'pad token id - as provided by the tokenizer - will not be masked out of '
+        'the loss calculation: it cannot be determined whether they are truly pad. '
+        'If instead this argument is set, the training flow will treat all tokens '
+        'that share the same id as the pad token as true pad tokens, potentially '
+        'causing severe training instability.',
+    )
     group.add_argument('--fim-data', action='store_true', help='Whether to use the FIM dataset.')
-    group.add_argument('--fim-rate', type=float, default=0.5,
-                       help='Probability to convert a training sample into a FIM format.')
-    group.add_argument('--fim-spm-rate', type=float, default=0.5,
-                       help='Probability that the a FIM sample uses the SPM format over the PSM format.')
-    group.add_argument('--fim-split-sample', type=str, default=None,
-                       help='String around which to split the sample for FIM.')
-    group.add_argument('--fim-fragment-rate', type=float, default=None,
-                       help='Rate of FIM on each fragment when --fim-split-sample is not None.')
-    group.add_argument('--fim-no-prefix', type=str, default=None,
-                       help='Do not apply FIM to fragments that start with this prefix')
-    group.add_argument('--fim-prefix-token', type=str, default='<fim_prefix>',
-                       help='FIM prefix token')
-    group.add_argument('--fim-middle-token', type=str, default='<fim_middle>',
-                       help='FIM middle token')
-    group.add_argument('--fim-suffix-token', type=str, default='<fim_suffix>',
-                       help='FIM suffix token')
-    group.add_argument('--fim-pad-token', type=str, default='<fim_pad>',
-                       help='FIM PAD token')
-    group.add_argument('--fim-eod-token', type=str, default='<|endoftext|>',
-                       help='FIM EOD token')
+    group.add_argument(
+        '--fim-rate',
+        type=float,
+        default=0.5,
+        help='Probability to convert a training sample into a FIM format.',
+    )
+    group.add_argument(
+        '--fim-spm-rate',
+        type=float,
+        default=0.5,
+        help='Probability that the a FIM sample uses the SPM format over the PSM format.',
+    )
+    group.add_argument(
+        '--fim-split-sample',
+        type=str,
+        default=None,
+        help='String around which to split the sample for FIM.',
+    )
+    group.add_argument(
+        '--fim-fragment-rate',
+        type=float,
+        default=None,
+        help='Rate of FIM on each fragment when --fim-split-sample is not None.',
+    )
+    group.add_argument(
+        '--fim-no-prefix',
+        type=str,
+        default=None,
+        help='Do not apply FIM to fragments that start with this prefix',
+    )
+    group.add_argument(
+        '--fim-prefix-token', type=str, default='<fim_prefix>', help='FIM prefix token'
+    )
+    group.add_argument(
+        '--fim-middle-token', type=str, default='<fim_middle>', help='FIM middle token'
+    )
+    group.add_argument(
+        '--fim-suffix-token', type=str, default='<fim_suffix>', help='FIM suffix token'
+    )
+    group.add_argument('--fim-pad-token', type=str, default='<fim_pad>', help='FIM PAD token')
+    group.add_argument('--fim-eod-token', type=str, default='<|endoftext|>', help='FIM EOD token')
     return parser
 
 
 def _add_autoresume_args(parser):
     group = parser.add_argument_group(title='autoresume')
 
-    group.add_argument('--adlr-autoresume', action='store_true',
-                       help='Enable autoresume on adlr cluster.')
-    group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
-                       help='Intervals over which check for autoresume'
-                       'termination signal')
+    group.add_argument(
+        '--adlr-autoresume', action='store_true', help='Enable autoresume on adlr cluster.'
+    )
+    group.add_argument(
+        '--adlr-autoresume-interval',
+        type=int,
+        default=1000,
+        help='Intervals over which check for autoresume' 'termination signal',
+    )
 
     return parser
 
@@ -3046,56 +4299,95 @@ def _add_biencoder_args(parser):
     group = parser.add_argument_group(title='biencoder')
 
     # network size
-    group.add_argument('--ict-head-size', type=int, default=None,
-                       help='Size of block embeddings to be used in ICT and '
-                        'REALM (paper default: 128)')
-    group.add_argument('--biencoder-projection-dim', type=int, default=0,
-                       help='Size of projection head used in biencoder (paper'
-                        ' default: 128)')
-    group.add_argument('--biencoder-shared-query-context-model', action='store_true',
-                        help='Whether to share the parameters of the query '
-                        'and context models or not')
+    group.add_argument(
+        '--ict-head-size',
+        type=int,
+        default=None,
+        help='Size of block embeddings to be used in ICT and ' 'REALM (paper default: 128)',
+    )
+    group.add_argument(
+        '--biencoder-projection-dim',
+        type=int,
+        default=0,
+        help='Size of projection head used in biencoder (paper' ' default: 128)',
+    )
+    group.add_argument(
+        '--biencoder-shared-query-context-model',
+        action='store_true',
+        help='Whether to share the parameters of the query ' 'and context models or not',
+    )
 
     # checkpointing
-    group.add_argument('--ict-load', type=str, default=None,
-                       help='Directory containing an ICTBertModel checkpoint')
-    group.add_argument('--bert-load', type=str, default=None,
-                       help='Directory containing an BertModel checkpoint '
-                       '(needed to start ICT and REALM)')
+    group.add_argument(
+        '--ict-load', type=str, default=None, help='Directory containing an ICTBertModel checkpoint'
+    )
+    group.add_argument(
+        '--bert-load',
+        type=str,
+        default=None,
+        help='Directory containing an BertModel checkpoint ' '(needed to start ICT and REALM)',
+    )
 
     # data
-    group.add_argument('--titles-data-path', type=str, default=None,
-                       help='Path to titles dataset used for ICT')
-    group.add_argument('--query-in-block-prob', type=float, default=0.1,
-                       help='Probability of keeping query in block for '
-                       'ICT dataset')
-    group.add_argument('--use-one-sent-docs', action='store_true',
-                       help='Whether to use one sentence documents in ICT')
-    group.add_argument('--evidence-data-path', type=str, default=None,
-                       help='Path to Wikipedia Evidence frm DPR paper')
+    group.add_argument(
+        '--titles-data-path', type=str, default=None, help='Path to titles dataset used for ICT'
+    )
+    group.add_argument(
+        '--query-in-block-prob',
+        type=float,
+        default=0.1,
+        help='Probability of keeping query in block for ' 'ICT dataset',
+    )
+    group.add_argument(
+        '--use-one-sent-docs',
+        action='store_true',
+        help='Whether to use one sentence documents in ICT',
+    )
+    group.add_argument(
+        '--evidence-data-path',
+        type=str,
+        default=None,
+        help='Path to Wikipedia Evidence frm DPR paper',
+    )
 
     # training
-    group.add_argument('--retriever-report-topk-accuracies', nargs='+', type=int,
-                        default=[], help="Which top-k accuracies to report "
-                        "(e.g. '1 5 20')")
-    group.add_argument('--retriever-score-scaling', action='store_true',
-                       help='Whether to scale retriever scores by inverse '
-                        'square root of hidden size')
+    group.add_argument(
+        '--retriever-report-topk-accuracies',
+        nargs='+',
+        type=int,
+        default=[],
+        help="Which top-k accuracies to report " "(e.g. '1 5 20')",
+    )
+    group.add_argument(
+        '--retriever-score-scaling',
+        action='store_true',
+        help='Whether to scale retriever scores by inverse ' 'square root of hidden size',
+    )
 
     # faiss index
-    group.add_argument('--block-data-path', type=str, default=None,
-                       help='Where to save/load BlockData to/from')
-    group.add_argument('--embedding-path', type=str, default=None,
-                       help='Where to save/load Open-Retrieval Embedding'
-                        ' data to/from')
+    group.add_argument(
+        '--block-data-path', type=str, default=None, help='Where to save/load BlockData to/from'
+    )
+    group.add_argument(
+        '--embedding-path',
+        type=str,
+        default=None,
+        help='Where to save/load Open-Retrieval Embedding' ' data to/from',
+    )
 
     # indexer
-    group.add_argument('--indexer-batch-size', type=int, default=128,
-                       help='How large of batches to use when doing indexing '
-                       'jobs')
-    group.add_argument('--indexer-log-interval', type=int, default=1000,
-                       help='After how many batches should the indexer '
-                       'report progress')
+    group.add_argument(
+        '--indexer-batch-size',
+        type=int,
+        default=128,
+        help='How large of batches to use when doing indexing ' 'jobs',
+    )
+    group.add_argument(
+        '--indexer-log-interval',
+        type=int,
+        default=1000,
+        help='After how many batches should the indexer ' 'report progress',
+    )
     return parser
 
 
@@ -3103,144 +4395,265 @@ def _add_vision_args(parser):
     group = parser.add_argument_group(title="vision")
 
     # general vision arguements
-    group.add_argument('--num-classes', type=int, default=1000,
-                       help='num of classes in vision classificaiton task')
-    group.add_argument('--img-h', type=int, default=224,
-                       help='Image height for vision classification task')
-    group.add_argument('--img-w', type=int, default=224,
-                       help='Image height for vision classification task')
-    group.add_argument('--num-channels', type=int, default=3,
-                       help='Number of channels in input image data')
-    group.add_argument('--patch-dim', type=int, default=16,
-                       help='patch dimension')
-    group.add_argument('--classes-fraction', type=float, default=1.0,
-                       help='training with fraction of classes.')
-    group.add_argument('--data-per-class-fraction', type=float, default=1.0,
-                       help='training with fraction of data per class.')
-    group.add_argument('--no-data-sharding', action='store_false',
-                       help='Disable data sharding.',
-                       dest='data_sharding')
-    group.add_argument('--head-lr-mult', type=float, default=1.0,
-                       help='learning rate multiplier for head during finetuning')
+    group.add_argument(
+        '--num-classes', type=int, default=1000, help='num of classes in vision classificaiton task'
+    )
+    group.add_argument(
+        '--img-h', type=int, default=224, help='Image height for vision classification task'
+    )
+    group.add_argument(
+        '--img-w', type=int, default=224, help='Image height for vision classification task'
+    )
+    group.add_argument(
+        '--num-channels', type=int, default=3, help='Number of channels in input image data'
+    )
+    group.add_argument('--patch-dim', type=int, default=16, help='patch dimension')
+    group.add_argument(
+        '--classes-fraction', type=float, default=1.0, help='training with fraction of classes.'
+    )
+    group.add_argument(
+        '--data-per-class-fraction',
+        type=float,
+        default=1.0,
+        help='training with fraction of data per class.',
+    )
+    group.add_argument(
+        '--no-data-sharding',
+        action='store_false',
+        help='Disable data sharding.',
+        dest='data_sharding',
+    )
+    group.add_argument(
+        '--head-lr-mult',
+        type=float,
+        default=1.0,
+        help='learning rate multiplier for head during finetuning',
+    )
 
     # pretraining type and backbone selection`
-    group.add_argument('--vision-pretraining', action='store_true',
-                       help='flag to indicate vision pretraining')
-    group.add_argument('--vision-pretraining-type', type=str, default='classify',
-                       choices=['classify', 'inpaint', 'dino'],
-                       help='pretraining objectives')
-    group.add_argument('--vision-backbone-type', type=str, default='vit',
-                       choices=['vit', 'mit', 'swin'],
-                       help='backbone types types')
-    group.add_argument('--swin-backbone-type', type=str, default='tiny',
-                       choices=['tiny', 'base', 'h3'],
-                       help='pretraining objectives')
+    group.add_argument(
+        '--vision-pretraining', action='store_true', help='flag to indicate vision pretraining'
+    )
+    group.add_argument(
+        '--vision-pretraining-type',
+        type=str,
+        default='classify',
+        choices=['classify', 'inpaint', 'dino'],
+        help='pretraining objectives',
+    )
+    group.add_argument(
+        '--vision-backbone-type',
+        type=str,
+        default='vit',
+        choices=['vit', 'mit', 'swin'],
+        help='backbone types types',
+    )
+    group.add_argument(
+        '--swin-backbone-type',
+        type=str,
+        default='tiny',
+        choices=['tiny', 'base', 'h3'],
+        help='pretraining objectives',
+    )
     # inpainting arguments
-    group.add_argument('--mask-type', type=str, default='random',
-                       choices=['random', 'row'],
-                       help='mask types')
-    group.add_argument('--mask-factor', type=float, default=1.0,
-                       help='mask size scaling parameter')
+    group.add_argument(
+        '--mask-type', type=str, default='random', choices=['random', 'row'], help='mask types'
+    )
+    group.add_argument('--mask-factor', type=float, default=1.0, help='mask size scaling parameter')
 
     # dino arguments
-    group.add_argument('--iter-per-epoch', type=int, default=1250,
-                       help='iterations per epoch')
-    group.add_argument('--dino-local-img-size', type=int, default=96,
-                       help='Image size for vision classification task')
-    group.add_argument('--dino-local-crops-number', type=int, default=10,
-                       help='Number of local crops')
-    group.add_argument('--dino-head-hidden-size', type=int, default=2048,
-                       help='Hidden dimension size in dino head')
-    group.add_argument('--dino-bottleneck-size', type=int, default=256,
-                       help='Bottle neck dimension in dino head ')
-    group.add_argument('--dino-freeze-last-layer', type=float, default=1,
-                       help='Freezing last layer weights')
-    group.add_argument('--dino-norm-last-layer', action='store_true',
-                       help='Disable Norm in last layer.')
-    group.add_argument('--dino-warmup-teacher-temp', type=float, default=0.04,
-                       help='warump teacher temperature')
-    group.add_argument('--dino-teacher-temp', type=float, default=0.07,
-                       help='teacher temperature')
-    group.add_argument('--dino-warmup-teacher-temp-epochs', type=int, default=30,
-                       help='warmup teacher temperaure epochs')
+    group.add_argument('--iter-per-epoch', type=int, default=1250, help='iterations per epoch')
+    group.add_argument(
+        '--dino-local-img-size',
+        type=int,
+        default=96,
+        help='Image size for vision classification task',
+    )
+    group.add_argument(
+        '--dino-local-crops-number', type=int, default=10, help='Number of local crops'
+    )
+    group.add_argument(
+        '--dino-head-hidden-size', type=int, default=2048, help='Hidden dimension size in dino head'
+    )
+    group.add_argument(
+        '--dino-bottleneck-size', type=int, default=256, help='Bottle neck dimension in dino head '
+    )
+    group.add_argument(
+        '--dino-freeze-last-layer', type=float, default=1, help='Freezing last layer weights'
+    )
+    group.add_argument(
+        '--dino-norm-last-layer', action='store_true', help='Disable Norm in last layer.'
+    )
+    group.add_argument(
+        '--dino-warmup-teacher-temp', type=float, default=0.04, help='warump teacher temperature'
+    )
+    group.add_argument('--dino-teacher-temp', type=float, default=0.07, help='teacher temperature')
+    group.add_argument(
+        '--dino-warmup-teacher-temp-epochs',
+        type=int,
+        default=30,
+        help='warmup teacher temperaure epochs',
+    )
 
     return parser
 
+
 def _add_moe_args(parser):
     group = parser.add_argument_group(title="moe")
     # General arguments
-    group.add_argument('--num-experts', type=int, default=None,
-                       help='Number of Experts in MoE (None means no MoE)')
-    group.add_argument('--moe-layer-freq', type=moe_freq_type, default=1,
-                       help='Frequency between MoE layers and Dense layers. Accepts either: '
-                            '- An integer N: Represents a 1:N ratio, meaning one expert layer for every N-1 dense layers '
-                            '- A string containing a Python list expression that defines a custom pattern, e.g.: '
-                            '"([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] '
-                            'where 1 indicates an expert layer and 0 indicates a dense layer. '
-                            'Examples: "([0]+[1]*23)": 1 dense layer followed by 23 expert layers, '
-                            '"([1]*3+[0]*2)*2": Three expert layers followed by two dense layers, repeated twice.')
-    group.add_argument('--moe-use-upcycling', action='store_true',
-                       help='Load a checkpoint of a dense model, convert it into an MoE model, and save the converted model to the path specified by --save. '
-                       'Upcycling is implemented on the top of distributed checkpointing, so it supports parallel modes different from the dense model.')
+    group.add_argument(
+        '--num-experts', type=int, default=None, help='Number of Experts in MoE (None means no MoE)'
+    )
+    group.add_argument(
+        '--moe-layer-freq',
+        type=moe_freq_type,
+        default=1,
+        help='Frequency between MoE layers and Dense layers. Accepts either: '
+        '- An integer N: Represents a 1:N ratio, meaning one expert layer for every N-1 dense layers '
+        '- A string containing a Python list expression that defines a custom pattern, e.g.: '
+        '"([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] '
+        'where 1 indicates an expert layer and 0 indicates a dense layer. '
+        'Examples: "([0]+[1]*23)": 1 dense layer followed by 23 expert layers, '
+        '"([1]*3+[0]*2)*2": Three expert layers followed by two dense layers, repeated twice.',
+    )
+    group.add_argument(
+        '--moe-use-upcycling',
+        action='store_true',
+        help='Load a checkpoint of a dense model, convert it into an MoE model, and save the converted model to the path specified by --save. '
+        'Upcycling is implemented on the top of distributed checkpointing, so it supports parallel modes different from the dense model.',
+    )
     # Router arguments
-    group.add_argument('--moe-router-load-balancing-type', nargs='+', type=str,
-                       choices=['aux_loss', 'seq_aux_loss', 'global_aux_loss', 'sinkhorn', 'none'],
-                       default='aux_loss',
-                       help='Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer; "seq_aux_loss" corresponds to the load balancing loss used in DeepSeekV2, which computes the loss for each individual sample; "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss".')
-    group.add_argument('--moe-aux-loss-coeff', type=float, nargs='+', default=0.0,
-                       help='Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended.')
+    group.add_argument(
+        '--moe-router-load-balancing-type',
+        nargs='+',
+        type=str,
+        choices=['aux_loss', 'seq_aux_loss', 'global_aux_loss', 'sinkhorn', 'none'],
+        default='aux_loss',
+        help='Determines the load balancing strategy for the router. "aux_loss" corresponds to the load balancing loss used in GShard and SwitchTransformer; "seq_aux_loss" corresponds to the load balancing loss used in DeepSeekV2, which computes the loss for each individual sample; "sinkhorn" corresponds to the balancing algorithm used in S-BASE, and "none" implies no load balancing. The default is "aux_loss".',
+    )
+    group.add_argument(
+        '--moe-aux-loss-coeff',
+        type=float,
+        nargs='+',
+        default=0.0,
+        help='Scaling coefficient for the aux loss: a starting value of 1e-2 is recommended.',
+    )
     # Token dispatcher arguments
     # MoE communication overlap arguments
 
-    group.add_argument('--moe-upcycling-granularity', type=int, default=1,
-                       help='This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. '
-                       'For using granular upcycling strategy, please set this param as a positive integer. If this param is set to 1, it means using the default upcycling strategy.')
+    group.add_argument(
+        '--moe-upcycling-granularity',
+        type=int,
+        default=1,
+        help='This param sepecifics how many times smaller is the expert hidden size compared with the original dense FFN hidden size. '
+        'For using granular upcycling strategy, please set this param as a positive integer. If this param is set to 1, it means using the default upcycling strategy.',
+    )
     return parser
 
+
 def _add_mla_args(parser):
     group = parser.add_argument_group(title="mla")
-    group.add_argument('--q-lora-rank', type=int, default=None,
-                       help="Rank of Query tensor's low rank representation.")
-    group.add_argument('--kv-lora-rank', type=int, default=32,
-                       help="Rank of Key and Value tensors' low rank representation.")
-    group.add_argument('--qk-head-dim', type=int, default=128,
-                       help="Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim")
-    group.add_argument('--qk-pos-emb-head-dim', type=int, default=64,
-                       help="Dimension of the position embedding in the QK projection.")
-    group.add_argument('--v-head-dim', type=int, default=128,
-                       help="Dimension of the head in the V projection.")
-    group.add_argument('--rotary-scaling-factor', type=float, default=1.0,
-                       help="Rotary scaling factor for the rotary embeddings.")
-    group.add_argument('--mscale', type=float, default=1.0,
-                       help="Mscale for YaRN RoPE in multi-latent attention.")
-    group.add_argument('--mscale-all-dim', type=float, default=0.0,
-                       help="Mscale all dimensions for YaRN RoPE in multi-latent attention.")
-    group.add_argument('--cache-mla-latents', action='store_true', default=False,
-                       help="If set caches the mla down projected latents with mla flash decode.")
+    group.add_argument(
+        '--q-lora-rank',
+        type=int,
+        default=None,
+        help="Rank of Query tensor's low rank representation.",
+    )
+    group.add_argument(
+        '--kv-lora-rank',
+        type=int,
+        default=32,
+        help="Rank of Key and Value tensors' low rank representation.",
+    )
+    group.add_argument(
+        '--qk-head-dim',
+        type=int,
+        default=128,
+        help="Dimension of the head in the QK projection. q_head_dim = qk_head_dim + qk_pos_emb_head_dim",
+    )
+    group.add_argument(
+        '--qk-pos-emb-head-dim',
+        type=int,
+        default=64,
+        help="Dimension of the position embedding in the QK projection.",
+    )
+    group.add_argument(
+        '--v-head-dim', type=int, default=128, help="Dimension of the head in the V projection."
+    )
+    group.add_argument(
+        '--rotary-scaling-factor',
+        type=float,
+        default=1.0,
+        help="Rotary scaling factor for the rotary embeddings.",
+    )
+    group.add_argument(
+        '--mscale', type=float, default=1.0, help="Mscale for YaRN RoPE in multi-latent attention."
+    )
+    group.add_argument(
+        '--mscale-all-dim',
+        type=float,
+        default=0.0,
+        help="Mscale all dimensions for YaRN RoPE in multi-latent attention.",
+    )
+    group.add_argument(
+        '--o-groups',
+        type=int,
+        default=8,
+        help="Number of groups for grouped output (wo_a). 0 = single linear."
+    )
+    group.add_argument(
+        '--o-lora-rank',
+        type=int,
+        default=1024,
+        help="Low-rank dimension per group for grouped output (wo_a). Used when o-groups > 0."
+    )
+    group.add_argument(
+        '--cache-mla-latents',
+        action='store_true',
+        default=False,
+        help="If set caches the mla down projected latents with mla flash decode.",
+    )
     group.add_argument(
         '--mla-down-proj-fusion',
         action='store_true',
         default=False,
         help="Enable fused q/kv down-projection and fused input layernorm when backend supports. "
-             "Otherwise fall back to the unfused MLA.",
+        "Otherwise fall back to the unfused MLA.",
     )
 
     return parser
 
+
 def _add_experimental_attention_variant_args(parser):
     group = parser.add_argument_group(title="experimental_attention_variant")
     # Linear attention
-    group.add_argument('--linear-attention-freq', type=la_freq_type, default=None,
-                       help='Frequency between LA (linear attention) layers and'
-                            ' SDPA (scaled dot-product attention) layers. Accepts either: '
-                            '- An integer N: Represents a (N-1):N ratio, meaning (N-1) LA layers for every 1 SDPA layer '
-                            '- A string containing a Python list expression that defines a custom pattern, e.g.: '
-                            '"([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] '
-                            'where 1 indicates an LA layer and 0 indicates a SDPA layer. '
-                            'Examples: "([0]+[1]*23)": 1 SDPA layer followed by 23 LA layers, '
-                            '"([1]*3+[0]*2)*2": Three LA layers followed by two SDPA layers, repeated twice.')
+    group.add_argument(
+        '--linear-attention-freq',
+        type=la_freq_type,
+        default=None,
+        help='Frequency between LA (linear attention) layers and'
+        ' SDPA (scaled dot-product attention) layers. Accepts either: '
+        '- An integer N: Represents a (N-1):N ratio, meaning (N-1) LA layers for every 1 SDPA layer '
+        '- A string containing a Python list expression that defines a custom pattern, e.g.: '
+        '"([1]*3+[0]*1)*3" evaluates to [1,1,1,0,1,1,1,0,1,1,1,0] '
+        'where 1 indicates an LA layer and 0 indicates a SDPA layer. '
+        'Examples: "([0]+[1]*23)": 1 SDPA layer followed by 23 LA layers, '
+        '"([1]*3+[0]*2)*2": Three LA layers followed by two SDPA layers, repeated twice.',
+    )
+    group.add_argument(
+        '--csa-compress-ratios',
+        type=compress_ratios_type,
+        default=None,
+        help='Per-layer compress ratios for compressed sparse attention. '
+            'Accepts a string containing a Python list expression, e.g.: '
+            '"[0,0,4,128,4,128]" or "([0]+[4,128]*2)*3". '
+            'Each value is the compression ratio for the corresponding '
+            'transformer layer (valid values: 0, 4, 128). '
+            'The list length must equal num_layers.'
+    )
     return parser
 
+
 def _add_heterogeneous_args(parser):
     """
     Heterogeneous models refer to transformer architectures where individual layers can differ
@@ -3280,94 +4693,155 @@ def _add_heterogeneous_args(parser):
     ]
     """
     group = parser.add_argument_group(title="heterogeneous architecture")
-    group.add_argument('--heterogeneous-layers-config-path', type=str, default=None,
-                       help='Path to json file containing heterogeneous model configuration. '
-                       'Use the format of the HuggingFace config files in llama nemotron '
-                       'models, e.g. https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1/resolve/main/config.json.')
-    group.add_argument('--heterogeneous-layers-config-encoded-json', type=str, default=None,
-                       help='This is encoded json string of the heterogeneous model configuration. Used to keep the content '
-                       'of the heterogeneous model specification in args when the model is loaded from a checkpoint. '
-                       'Use the format of the HuggingFace config files in llama nemotron '
-                       'models, e.g. https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1/resolve/main/config.json.')
+    group.add_argument(
+        '--heterogeneous-layers-config-path',
+        type=str,
+        default=None,
+        help='Path to json file containing heterogeneous model configuration. '
+        'Use the format of the HuggingFace config files in llama nemotron '
+        'models, e.g. https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1/resolve/main/config.json.',
+    )
+    group.add_argument(
+        '--heterogeneous-layers-config-encoded-json',
+        type=str,
+        default=None,
+        help='This is encoded json string of the heterogeneous model configuration. Used to keep the content '
+        'of the heterogeneous model specification in args when the model is loaded from a checkpoint. '
+        'Use the format of the HuggingFace config files in llama nemotron '
+        'models, e.g. https://huggingface.co/nvidia/Llama-3_3-Nemotron-Super-49B-v1/resolve/main/config.json.',
+    )
     return parser
 
+
 def _add_experimental_args(parser):
     group = parser.add_argument_group(title='experimental')
 
-    group.add_argument('--enable-experimental', action='store_true',
-                       help='Enable experimental features.')
-    group.add_argument('--spec', type=str, default=None, nargs='*',
-                       help='Specify the <module_location function_name> pair '
-                       'that returns a spec to customize a model, transformer '
-                       'block, or transformer layer, depending on the use case.'
-                       'To use local spec specify local as the argument.'
-                       'For more details, see the model class, '
-                       '`transformer_block.py`, or `transformer_layer.py`')
-    group.add_argument('--hybrid-layer-pattern', type=str, default=None,
-                       help='Specify a hybrid layer pattern using M (mamba), G (gdn), '
-                       '* (attention), D (dsa), - (mlp), E (moe). Use | to define pipeline '
-                       'stage boundaries for flexible virtual pipeline parallel (fVPP). '
-                       'Use / to separate MTP patterns. '
-                       'Example: "M-M-|M-M*-|M-M-|M-M*-" or "M-M-|M-M*-/MM/MM". '
-                       'When this flag is used, it is the sole indicator that a hybrid model '
-                       'is being run.')
-    group.add_argument('--hybrid-override-pattern', type=str, default=None,
-                       help='Deprecated. Use --hybrid-layer-pattern instead. '
-                       'If specified, its value will be forwarded to --hybrid-layer-pattern.')
-    group.add_argument('--yaml-cfg', type=str, default=None,
-                       help = 'Config file to add additional arguments')
+    group.add_argument(
+        '--enable-experimental', action='store_true', help='Enable experimental features.'
+    )
+    group.add_argument(
+        '--spec',
+        type=str,
+        default=None,
+        nargs='*',
+        help='Specify the <module_location function_name> pair '
+        'that returns a spec to customize a model, transformer '
+        'block, or transformer layer, depending on the use case.'
+        'To use local spec specify local as the argument.'
+        'For more details, see the model class, '
+        '`transformer_block.py`, or `transformer_layer.py`',
+    )
+    group.add_argument(
+        '--hybrid-layer-pattern',
+        type=str,
+        default=None,
+        help='Specify a hybrid layer pattern using M (mamba), G (gdn), '
+        '* (attention), D (dsa), - (mlp), E (moe). Use | to define pipeline '
+        'stage boundaries for flexible virtual pipeline parallel (fVPP). '
+        'Use / to separate MTP patterns. '
+        'Example: "M-M-|M-M*-|M-M-|M-M*-" or "M-M-|M-M*-/MM/MM". '
+        'When this flag is used, it is the sole indicator that a hybrid model '
+        'is being run.',
+    )
+    group.add_argument(
+        '--hybrid-override-pattern',
+        type=str,
+        default=None,
+        help='Deprecated. Use --hybrid-layer-pattern instead. '
+        'If specified, its value will be forwarded to --hybrid-layer-pattern.',
+    )
+    group.add_argument(
+        '--yaml-cfg', type=str, default=None, help='Config file to add additional arguments'
+    )
 
     # Args of precision-aware optimizer.
-    group.add_argument('--use-precision-aware-optimizer', action='store_true',
-                       help='Use the precision-aware optimizer in TransformerEngine, which allows '
-                       'setting the main params and optimizer states to lower precision, such as '
-                       'fp16, bf16 and fp8.')
-    group.add_argument('--main-grads-dtype', default='fp32', choices=['fp32', 'bf16'],
-                       help='Dtype of main grads when enabling precision-aware-optimizer.')
-    group.add_argument('--main-params-dtype', default='fp32', choices=['fp32', 'fp16'],
-                       help='Dtype of main params when enabling precision-aware-optimizer.')
-    group.add_argument('--exp-avg-dtype', default='fp32', choices=['fp32', 'fp16', 'bf16', 'fp8'],
-                       help='Dtype of exp_avg (1st moment in adam optimizer) when enabling '
-                            'precision-aware-optimizer. This dtype is used for storing the '
-                            'optimizer state in memory during training but does not affect '
-                            'the precision in the kernel computation.')
-    group.add_argument('--exp-avg-sq-dtype', default='fp32', choices=['fp32', 'fp16', 'bf16', 'fp8'],
-                       help='Dtype of exp_avg_sq (2nd moment in adam optimizer) when enabling '
-                            'precision-aware-optimizer. This dtype is used for storing the '
-                            'optimizer state in memory during training but does not affect '
-                            'the precision in the kernel computation.')
+    group.add_argument(
+        '--use-precision-aware-optimizer',
+        action='store_true',
+        help='Use the precision-aware optimizer in TransformerEngine, which allows '
+        'setting the main params and optimizer states to lower precision, such as '
+        'fp16, bf16 and fp8.',
+    )
+    group.add_argument(
+        '--main-grads-dtype',
+        default='fp32',
+        choices=['fp32', 'bf16'],
+        help='Dtype of main grads when enabling precision-aware-optimizer.',
+    )
+    group.add_argument(
+        '--main-params-dtype',
+        default='fp32',
+        choices=['fp32', 'fp16'],
+        help='Dtype of main params when enabling precision-aware-optimizer.',
+    )
+    group.add_argument(
+        '--exp-avg-dtype',
+        default='fp32',
+        choices=['fp32', 'fp16', 'bf16', 'fp8'],
+        help='Dtype of exp_avg (1st moment in adam optimizer) when enabling '
+        'precision-aware-optimizer. This dtype is used for storing the '
+        'optimizer state in memory during training but does not affect '
+        'the precision in the kernel computation.',
+    )
+    group.add_argument(
+        '--exp-avg-sq-dtype',
+        default='fp32',
+        choices=['fp32', 'fp16', 'bf16', 'fp8'],
+        help='Dtype of exp_avg_sq (2nd moment in adam optimizer) when enabling '
+        'precision-aware-optimizer. This dtype is used for storing the '
+        'optimizer state in memory during training but does not affect '
+        'the precision in the kernel computation.',
+    )
 
     # Megatron-FSDP Arguments
-    group.add_argument('--megatron-fsdp-main-params-dtype', default='fp32', choices=['fp32', 'bf16', 'fp16', 'auto'],
-                       help="Data type for the main weight buffer utilized for distributed optimization "
-                            "and quantization with Megatron-FSDP. If 'auto', then the native model parameter "
-                            "data-type will be used for the main weight data-type.")
-    group.add_argument('--megatron-fsdp-main-grads-dtype', default='auto', choices=['fp32', 'bf16', 'fp16', 'auto'],
-                       help="Data type for the main gradient buffer utilized for distributed optimization "
-                            "with Megatron-FSDP. If 'auto', then the native model gradient data-type will "
-                            "be used for the main gradient / accumulation data-type.")
-    group.add_argument("--megatron-fsdp-grad-comm-dtype", default='auto', choices=['fp32', 'fp16', 'bf16', 'auto'],
-                        help="When using Megatron-FSDP, this controls the data-type used when communicating "
-                             "model gradients during FSDP. If 'auto', then the main gradient data-type will "
-                             "be used for the gradient communication / reduction data-type. When using NCCL "
-                             "v2.27+, reduction is always computed in FP32 if using NCCL Symmetric kernels.")
-    
+    group.add_argument(
+        '--megatron-fsdp-main-params-dtype',
+        default='fp32',
+        choices=['fp32', 'bf16', 'fp16', 'auto'],
+        help="Data type for the main weight buffer utilized for distributed optimization "
+        "and quantization with Megatron-FSDP. If 'auto', then the native model parameter "
+        "data-type will be used for the main weight data-type.",
+    )
+    group.add_argument(
+        '--megatron-fsdp-main-grads-dtype',
+        default='auto',
+        choices=['fp32', 'bf16', 'fp16', 'auto'],
+        help="Data type for the main gradient buffer utilized for distributed optimization "
+        "with Megatron-FSDP. If 'auto', then the native model gradient data-type will "
+        "be used for the main gradient / accumulation data-type.",
+    )
+    group.add_argument(
+        "--megatron-fsdp-grad-comm-dtype",
+        default='auto',
+        choices=['fp32', 'fp16', 'bf16', 'auto'],
+        help="When using Megatron-FSDP, this controls the data-type used when communicating "
+        "model gradients during FSDP. If 'auto', then the main gradient data-type will "
+        "be used for the gradient communication / reduction data-type. When using NCCL "
+        "v2.27+, reduction is always computed in FP32 if using NCCL Symmetric kernels.",
+    )
+
     return parser
 
 
 def _add_msc_args(parser):
     group = parser.add_argument_group(title="msc")
-    group.add_argument('--disable-msc', default=True, action='store_false', dest='enable_msc',
-                       help='Disable the usage of Multi-Storage Client (MSC) in Megatron Core.')
+    group.add_argument(
+        '--disable-msc',
+        default=True,
+        action='store_false',
+        dest='enable_msc',
+        help='Disable the usage of Multi-Storage Client (MSC) in Megatron Core.',
+    )
     return parser
 
+
 def _add_kitchen_quantization_arguments(parser: argparse.ArgumentParser):
     """Add quant-specific arguments to the main parser
 
     If kitchen isn't available, nothing to do here, return unchanged parser
     """
     try:
-        from megatron.core.extensions.kitchen import KitchenSpecProvider, HAVE_KITCHEN
+        from megatron.core.extensions.kitchen import HAVE_KITCHEN, KitchenSpecProvider
 
     except (ImportError, ModuleNotFoundError):
         HAVE_KITCHEN = False
@@ -3391,9 +4865,22 @@ def _add_kitchen_quantization_arguments(parser: argparse.ArgumentParser):
         )
     return parser
 
+
 def _add_sft_args(parser):
     group = parser.add_argument_group(title='sft')
     group.add_argument('--sft', action="store_true", help='Megatron SFT training')
-    group.add_argument('--sft-tokenizer-prompt-format', type=str, default="nemotron-h-aligned",
-                       help='SFT prompt format.')
+    group.add_argument(
+        '--sft-tokenizer-prompt-format',
+        type=str,
+        default="nemotron-h-aligned",
+        help='SFT prompt format.',
+    )
+    group.add_argument(
+        '--sft-mock-dataset-config-json',
+        type=str,
+        default=None,
+        help='This config provides the necessary information for the mock dataset. You can either specify a CSV file that contains sequence lengths, where each line stores the length of a sequence, for example: {"mode":"file","path":"/path/to/file"}. Alternatively, you can specify a distribution (currently only supporting lognormal distribution) along with the required parameters, for example, {"mode":"distribution","type":"lognormal","min_seq_len":1024,"max_seq_len":2048,"mean_seq_len":1536,"lognormal_sigma":1.1}, where sigma controls the variability of the lognormal distribution. '
+        'If not specified and --mock-data is set, defaults to a lognormal distribution with '
+        'min_seq_len=seq_length//2, max_seq_len=seq_length, mean_seq_len=seq_length*3//4, lognormal_sigma=1.1.',
+    )
     return parser
diff --git a/megatron/training/async_utils.py b/megatron/training/async_utils.py
index 3d283c056fa..c1f75934aa5 100644
--- a/megatron/training/async_utils.py
+++ b/megatron/training/async_utils.py
@@ -91,7 +91,10 @@ def init_persistent_async_worker(rank: int, mp_mode: str = 'spawn'):
     # initialize ckpt write results queue
     get_write_results_queue('fork')
     if rank == 0:
-        print(f"init_persistent_async_worker: rank {rank}, Async Caller Started in {time.time() - time_start} seconds", flush=True)
+        print(
+            f"init_persistent_async_worker: rank {rank}, Async Caller Started in {time.time() - time_start} seconds",
+            flush=True,
+        )
 
 
 def schedule_async_save(async_request: AsyncRequest | NVRxAsyncRequest):
@@ -127,6 +130,7 @@ def maybe_finalize_async_save(blocking: bool = False, terminate=False):
     # Clean up finished deletion processes to prevent zombies
     # Import here to avoid circular dependency
     from .checkpointing import finalize_deletion_processes
+
     finalize_deletion_processes(blocking=blocking or terminate)
 
     if terminate and async_calls_queue is not None:
@@ -144,7 +148,7 @@ def is_empty_async_queue() -> bool:
 
 def reset_persistent_async_worker(async_strategy):
     global _async_calls_queue, _results_queue
-    
+
     if _async_calls_queue is not None:
         _async_calls_queue.close(abort=True)
         del _async_calls_queue
diff --git a/megatron/training/checkpointing.py b/megatron/training/checkpointing.py
index 1441a71518d..ed27e83051e 100644
--- a/megatron/training/checkpointing.py
+++ b/megatron/training/checkpointing.py
@@ -57,6 +57,7 @@
         handle_swiglu_in_state_dict,
         print_diff_in_state_dicts,
     )
+
     HAVE_MEGATRON_FSDP = True
 except ImportError:
     HAVE_MEGATRON_FSDP = False
@@ -67,6 +68,7 @@
     from modelopt.torch.opt.plugins import save_modelopt_state, save_sharded_modelopt_state
 
     from megatron.post_training.utils import print_distributed_quant_summary
+
     has_nvidia_modelopt = True
 except Exception:
     has_nvidia_modelopt = False
@@ -94,6 +96,7 @@
 # Track deletion processes to prevent zombies
 _deletion_processes = []
 
+
 def finalize_deletion_processes(blocking=False):
     """Clean up deletion processes to prevent zombie processes.
 
@@ -111,17 +114,19 @@ def finalize_deletion_processes(blocking=False):
     finished = []
     for proc in _deletion_processes:
         if not proc.is_alive() or blocking:
-            logger.debug(f"Joining deletion process {proc.pid} (blocking={blocking}, is_alive={proc.is_alive()})")
+            logger.debug(
+                f"Joining deletion process {proc.pid} (blocking={blocking}, is_alive={proc.is_alive()})"
+            )
             proc.join()
             finished.append(proc)
     for proc in finished:
         _deletion_processes.remove(proc)
 
+
 def set_checkpoint_version(value):
     global _CHECKPOINT_VERSION
     if _CHECKPOINT_VERSION is not None:
-        assert _CHECKPOINT_VERSION == value, \
-            "checkpoint versions do not match"
+        assert _CHECKPOINT_VERSION == value, "checkpoint versions do not match"
     _CHECKPOINT_VERSION = value
 
 
@@ -161,9 +166,10 @@ def _compare(arg_name, old_arg_name=None, default=None):
         else:
             checkpoint_value = getattr(checkpoint_args, ckpt_arg_name)
         args_value = getattr(args, arg_name)
-        error_message = '{} value from checkpoint ({}) is not equal to the ' \
-                        'input argument value ({}).'.format(
-                            arg_name, checkpoint_value, args_value)
+        error_message = (
+            '{} value from checkpoint ({}) is not equal to the '
+            'input argument value ({}).'.format(arg_name, checkpoint_value, args_value)
+        )
         assert checkpoint_value == args_value, error_message
 
     _compare('num_layers')
@@ -181,8 +187,7 @@ def _compare(arg_name, old_arg_name=None, default=None):
     if args.phase_transition_iterations:
         _compare('global_batch_size')
     if get_checkpoint_version() < 3.0:
-        _compare('tensor_model_parallel_size',
-                 old_arg_name='model_parallel_size')
+        _compare('tensor_model_parallel_size', old_arg_name='model_parallel_size')
     if get_checkpoint_version() >= 3.0 and not args.use_dist_ckpt:
         _compare('tensor_model_parallel_size')
         _compare('pipeline_model_parallel_size')
@@ -206,11 +211,18 @@ def ensure_directory_exists(filename, check_parent=True):
         os.makedirs(dirname, exist_ok=True)
 
 
-def get_checkpoint_name(checkpoints_path, iteration, release=False,
-                        pipeline_parallel=None,
-                        tensor_rank=None, pipeline_rank=None,
-                        expert_parallel=None, expert_rank=None,
-                        return_base_dir=False, basename="model_optim_rng.pt"):
+def get_checkpoint_name(
+    checkpoints_path,
+    iteration,
+    release=False,
+    pipeline_parallel=None,
+    tensor_rank=None,
+    pipeline_rank=None,
+    expert_parallel=None,
+    expert_rank=None,
+    return_base_dir=False,
+    basename="model_optim_rng.pt",
+):
     """Determine the directory name for this rank's checkpoint."""
     if release:
         directory = 'release'
@@ -222,13 +234,13 @@ def get_checkpoint_name(checkpoints_path, iteration, release=False,
 
     # Use both the tensor and pipeline MP rank.
     if pipeline_parallel is None:
-        pipeline_parallel = (mpu.get_pipeline_model_parallel_world_size() > 1)
+        pipeline_parallel = mpu.get_pipeline_model_parallel_world_size() > 1
     if tensor_rank is None:
         tensor_rank = mpu.get_tensor_model_parallel_rank()
     if pipeline_rank is None:
         pipeline_rank = mpu.get_pipeline_model_parallel_rank()
     if expert_parallel is None:
-        expert_parallel = (mpu.get_expert_model_parallel_world_size() > 1)
+        expert_parallel = mpu.get_expert_model_parallel_world_size() > 1
     if expert_rank is None:
         expert_rank = mpu.get_expert_model_parallel_rank()
 
@@ -236,11 +248,11 @@ def get_checkpoint_name(checkpoints_path, iteration, release=False,
     # optimizer, then the optimizer's path must additionally include the
     # data parallel rank.
     if not pipeline_parallel:
-        common_path = os.path.join(checkpoints_path, directory,
-                            f'mp_rank_{tensor_rank:02d}')
+        common_path = os.path.join(checkpoints_path, directory, f'mp_rank_{tensor_rank:02d}')
     else:
-        common_path = os.path.join(checkpoints_path, directory,
-                f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}')
+        common_path = os.path.join(
+            checkpoints_path, directory, f'mp_rank_{tensor_rank:02d}_{pipeline_rank:03d}'
+        )
 
     if expert_parallel:
         common_path = common_path + f'_{expert_rank:03d}'
@@ -266,8 +278,7 @@ def get_load_checkpoint_path_by_args(args, load_arg="load"):
 
 
 def get_distributed_optimizer_checkpoint_name(model_checkpoint_name):
-    return os.path.join(os.path.dirname(model_checkpoint_name),
-                        "distrib_optim.pt")
+    return os.path.join(os.path.dirname(model_checkpoint_name), "distrib_optim.pt")
 
 
 def find_checkpoint_rank_0(checkpoints_path, iteration, release=False):
@@ -280,41 +291,65 @@ def find_checkpoint_rank_0(checkpoints_path, iteration, release=False):
     """
 
     # Look for checkpoint with no pipelining and no expert parallelism
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=False,
-                                   tensor_rank=0, pipeline_rank=0,
-                                   expert_parallel=False, expert_rank=0)
+    filename = get_checkpoint_name(
+        checkpoints_path,
+        iteration,
+        release,
+        pipeline_parallel=False,
+        tensor_rank=0,
+        pipeline_rank=0,
+        expert_parallel=False,
+        expert_rank=0,
+    )
     if isfile(filename):
         return filename
 
     # Look for checkpoint with no pipelining and expert parallelism
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=False,
-                                   tensor_rank=0, pipeline_rank=0,
-                                   expert_parallel=True, expert_rank=0)
+    filename = get_checkpoint_name(
+        checkpoints_path,
+        iteration,
+        release,
+        pipeline_parallel=False,
+        tensor_rank=0,
+        pipeline_rank=0,
+        expert_parallel=True,
+        expert_rank=0,
+    )
     if isfile(filename):
         return filename
 
     # Look for checkpoint with pipelining and no expert parallelism
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=True,
-                                   tensor_rank=0, pipeline_rank=0,
-                                   expert_parallel=False, expert_rank=0)
+    filename = get_checkpoint_name(
+        checkpoints_path,
+        iteration,
+        release,
+        pipeline_parallel=True,
+        tensor_rank=0,
+        pipeline_rank=0,
+        expert_parallel=False,
+        expert_rank=0,
+    )
     if isfile(filename):
         return filename
 
     # Look for checkpoint with pipelining and expert parallelism
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=True,
-                                   tensor_rank=0, pipeline_rank=0,
-                                   expert_parallel=True, expert_rank=0)
+    filename = get_checkpoint_name(
+        checkpoints_path,
+        iteration,
+        release,
+        pipeline_parallel=True,
+        tensor_rank=0,
+        pipeline_rank=0,
+        expert_parallel=True,
+        expert_rank=0,
+    )
     if isfile(filename):
         return filename
 
     # Look for a distributed checkpoint
-    filename = get_checkpoint_name(checkpoints_path, iteration, release,
-                                   pipeline_parallel=True,
-                                   return_base_dir=True)
+    filename = get_checkpoint_name(
+        checkpoints_path, iteration, release, pipeline_parallel=True, return_base_dir=True
+    )
     if dist_checkpointing.check_is_distributed_checkpoint(filename):
         return filename
 
@@ -322,7 +357,6 @@ def find_checkpoint_rank_0(checkpoints_path, iteration, release=False):
 
 
 def get_checkpoint_tracker_filename(checkpoints_path):
-
     """Tracker file rescords the latest chckpoint during
     training to restart from."""
     return os.path.join(checkpoints_path, 'latest_checkpointed_iteration.txt')
@@ -348,14 +382,12 @@ def read_metadata(tracker_filename):
         except ValueError:
             release = metastring == 'release'
             if not release:
-                print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(
-                    tracker_filename))
+                print_rank_0('ERROR: Invalid metadata file {}. Exiting'.format(tracker_filename))
                 sys.exit()
             else:
                 # Set iteration to 0 for release checkpoints
                 iteration = 0
-    assert iteration > -1 or release, 'error parsing metadata file {}'.format(
-        tracker_filename)
+    assert iteration > -1 or release, 'error parsing metadata file {}'.format(tracker_filename)
 
     # Get the max iteration retrieved across the ranks.
     if torch.distributed.is_initialized():
@@ -368,10 +400,12 @@ def read_metadata(tracker_filename):
         # iteration across all ranks.
         if iteration != max_iter:
             rank = torch.distributed.get_rank()
-            print('WARNING: on rank {} found iteration {} in the '
-                  'metadata while max iteration across the ranks '
-                  'is {}, replacing it with max iteration.'.format(
-                      rank, iteration, max_iter), flush=True)
+            print(
+                'WARNING: on rank {} found iteration {} in the '
+                'metadata while max iteration across the ranks '
+                'is {}, replacing it with max iteration.'.format(rank, iteration, max_iter),
+                flush=True,
+            )
     else:
         # When loading a checkpoint outside of training (for example,
         # when editing it), we might not have torch distributed
@@ -380,7 +414,11 @@ def read_metadata(tracker_filename):
     return max_iter, release
 
 
-def get_rng_state(ckpt_format: str, tp_group: torch.distributed.ProcessGroup, pp_group: torch.distributed.ProcessGroup) -> Union[List[Dict[str, Any]], ShardedObject]:
+def get_rng_state(
+    ckpt_format: str,
+    tp_group: torch.distributed.ProcessGroup,
+    pp_group: torch.distributed.ProcessGroup,
+) -> Union[List[Dict[str, Any]], ShardedObject]:
     """Collect rng state across data parallel ranks."""
     args = get_args()
     rng_state = {
@@ -388,17 +426,19 @@ def get_rng_state(ckpt_format: str, tp_group: torch.distributed.ProcessGroup, pp
         'np_rng_state': np.random.get_state(),
         'torch_rng_state': torch.get_rng_state(),
         'cuda_rng_state': torch.cuda.get_rng_state(),
-        'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states()}
+        'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states(),
+    }
 
     rng_state_list = None
-    if args.data_parallel_random_init and torch.distributed.is_initialized() and \
-            mpu.get_data_parallel_world_size() > 1:
-        rng_state_list = \
-            [None for i in range(mpu.get_data_parallel_world_size())]
+    if (
+        args.data_parallel_random_init
+        and torch.distributed.is_initialized()
+        and mpu.get_data_parallel_world_size() > 1
+    ):
+        rng_state_list = [None for i in range(mpu.get_data_parallel_world_size())]
         torch.distributed.all_gather_object(
-            rng_state_list,
-            rng_state,
-            group=mpu.get_data_parallel_group())
+            rng_state_list, rng_state, group=mpu.get_data_parallel_group()
+        )
     else:
         rng_state_list = [rng_state]
 
@@ -407,17 +447,21 @@ def get_rng_state(ckpt_format: str, tp_group: torch.distributed.ProcessGroup, pp
         pp_size = get_pg_size(pp_group)
         tp_rank = get_pg_rank(tp_group)
         tp_size = get_pg_size(tp_group)
-        rng_state_list = ShardedObject('rng_state', rng_state_list, (pp_size, tp_size), (pp_rank, tp_rank),
-                                       replica_id=mpu.get_data_parallel_rank(with_context_parallel=True))
+        rng_state_list = ShardedObject(
+            'rng_state',
+            rng_state_list,
+            (pp_size, tp_size),
+            (pp_rank, tp_rank),
+            replica_id=mpu.get_data_parallel_rank(with_context_parallel=True),
+        )
     elif ckpt_format == "fsdp_dtensor":
         pp_rank = mpu.get_pipeline_model_parallel_rank()
         tp_rank = mpu.get_tensor_model_parallel_rank()
-        rng_state_list = {
-            f"({pp_rank}, {tp_rank})": rng_state_list
-        }
+        rng_state_list = {f"({pp_rank}, {tp_rank})": rng_state_list}
 
     return rng_state_list
 
+
 class CheckpointType(Enum):
     LEGACY = auto()
     LOCAL = auto()
@@ -426,7 +470,9 @@ class CheckpointType(Enum):
     FSDP_DTENSOR = auto()
 
 
-def _build_sharded_state_dict_metadata(args: Namespace, dp_cp_group: Optional[torch.distributed.ProcessGroup] = None) -> dict:
+def _build_sharded_state_dict_metadata(
+    args: Namespace, dp_cp_group: Optional[torch.distributed.ProcessGroup] = None
+) -> dict:
     """Builds metadata used for sharded_state_dict versioning.
 
     The whole content metadata is passed to ``shared_state_dict`` model and optimizer methods
@@ -449,7 +495,9 @@ def _build_sharded_state_dict_metadata(args: Namespace, dp_cp_group: Optional[to
     if args.use_distributed_optimizer and args.ckpt_format != "fsdp_dtensor":
         if args.dist_ckpt_optim_fully_reshardable:
             metadata['distrib_optim_sharding_type'] = 'fully_reshardable'
-            metadata['distrib_optim_fully_reshardable_mem_efficient'] = args.distrib_optim_fully_reshardable_mem_efficient
+            metadata['distrib_optim_fully_reshardable_mem_efficient'] = (
+                args.distrib_optim_fully_reshardable_mem_efficient
+            )
         else:
             metadata['distrib_optim_sharding_type'] = 'dp_reshardable'
 
@@ -469,8 +517,10 @@ def save_grads(save_dir, state_dict, iteration, grad_label):
     NOTE: wgrads for non-expert layers will be duplicated if using expert parallelism, but
     this can be handled in postprocessing."""
 
-    print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saving {grad_label} "
-                 f"from iteration {iteration:7d}")
+    print_rank_0(
+        f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saving {grad_label} "
+        f"from iteration {iteration:7d}"
+    )
 
     if mpu.get_expert_data_parallel_rank() == 0:
         # Create saving directory.
@@ -492,13 +542,32 @@ def save_grads(save_dir, state_dict, iteration, grad_label):
         # Convert back to dict (e.g., from collections.defaultdict) for easy loading later.
         torch.save(dict(state_dict), full_save_path)
 
-    print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saved {grad_label} "
-                 f"from iteration {iteration:7d}")
+    print_rank_0(
+        f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saved {grad_label} "
+        f"from iteration {iteration:7d}"
+    )
 
 
-def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floating_point_operations_so_far,
-                    checkpointing_context=None, pipeline_rank=None, expert_rank=None, tensor_rank=None, pipeline_parallel=None, expert_parallel=None, non_persistent_ckpt=False,
-                    train_data_iterator=None, preprocess_common_state_dict_fn = None, release=False, tp_group: Optional[torch.distributed.ProcessGroup] = None, pp_group: Optional[torch.distributed.ProcessGroup] = None, dp_cp_group: Optional[torch.distributed.ProcessGroup] = None):
+def save_checkpoint(
+    iteration,
+    model,
+    optimizer,
+    opt_param_scheduler,
+    num_floating_point_operations_so_far,
+    checkpointing_context=None,
+    pipeline_rank=None,
+    expert_rank=None,
+    tensor_rank=None,
+    pipeline_parallel=None,
+    expert_parallel=None,
+    non_persistent_ckpt=False,
+    train_data_iterator=None,
+    preprocess_common_state_dict_fn=None,
+    release=False,
+    tp_group: Optional[torch.distributed.ProcessGroup] = None,
+    pp_group: Optional[torch.distributed.ProcessGroup] = None,
+    dp_cp_group: Optional[torch.distributed.ProcessGroup] = None,
+):
     """Save a model, optimizer and optionally dataloader checkpoint.
 
     Checkpointing context is used to persist some checkpointing state
@@ -520,7 +589,9 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
     args = get_args()
 
     if args.async_save and not is_empty_async_queue():
-        print_rank_0('WARNING: Starting a checkpoint save before previous has finished. Consider increasing the checkpoint interval.')
+        print_rank_0(
+            'WARNING: Starting a checkpoint save before previous has finished. Consider increasing the checkpoint interval.'
+        )
 
     # Prepare E2E metrics at start of save checkpoint
     productive_metrics = on_save_checkpoint_start(args.async_save)
@@ -551,11 +622,15 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
             ckpt_type = CheckpointType.LOCAL
             save_dir = checkpointing_context['local_checkpoint_manager'].local_ckpt_dir
         else:
-            raise NotImplementedError(f"Please use local or global non-persistent checkpoints (got: {args.non_persistent_ckpt_type})")
+            raise NotImplementedError(
+                f"Please use local or global non-persistent checkpoints (got: {args.non_persistent_ckpt_type})"
+            )
 
     ckpt_format = args.ckpt_format if ckpt_type == CheckpointType.GLOBAL else 'torch'
-    print_rank_0(f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saving checkpoint "
-                 f"at iteration {iteration:7d} to {save_dir} in {ckpt_format} format")
+    print_rank_0(
+        f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] saving checkpoint "
+        f"at iteration {iteration:7d} to {save_dir} in {ckpt_format} format"
+    )
 
     # Collect rng state across data parallel ranks.
     if tp_group is None and pp_group is None:
@@ -566,16 +641,27 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
     # Collect rerun state across all ranks
     rerun_state_machine = get_rerun_state_machine()
     rerun_state = rerun_state_machine.state_dict(
-        data_iterator=train_data_iterator, ckpt_format=args.ckpt_format,
+        data_iterator=train_data_iterator, ckpt_format=args.ckpt_format
     )
 
     # Checkpoint name.
-    return_base_dir = (ckpt_type != CheckpointType.LEGACY)
-    checkpoint_name = get_checkpoint_name(save_dir, iteration, release=release, pipeline_parallel=pipeline_parallel,
-        tensor_rank=tensor_rank, pipeline_rank=pipeline_rank, expert_parallel=expert_parallel, expert_rank=expert_rank, return_base_dir=return_base_dir)
+    return_base_dir = ckpt_type != CheckpointType.LEGACY
+    checkpoint_name = get_checkpoint_name(
+        save_dir,
+        iteration,
+        release=release,
+        pipeline_parallel=pipeline_parallel,
+        tensor_rank=tensor_rank,
+        pipeline_rank=pipeline_rank,
+        expert_parallel=expert_parallel,
+        expert_rank=expert_rank,
+        return_base_dir=return_base_dir,
+    )
 
     # Save dataloader state if the dataloader supports it (currently only Megatron Energon).
-    maybe_save_dataloader_state(train_data_iterator, iteration, getattr(args, "dataloader_save", None))
+    maybe_save_dataloader_state(
+        train_data_iterator, iteration, getattr(args, "dataloader_save", None)
+    )
 
     # Save distributed optimizer's custom parameter state.
     if (
@@ -584,8 +670,7 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
         and optimizer is not None
         and ckpt_type == CheckpointType.LEGACY
     ):
-        optim_checkpoint_name = \
-            get_distributed_optimizer_checkpoint_name(checkpoint_name)
+        optim_checkpoint_name = get_distributed_optimizer_checkpoint_name(checkpoint_name)
         ensure_directory_exists(optim_checkpoint_name)
         if not optimizer.is_stub_optimizer:
             optimizer.save_parameter_state(optim_checkpoint_name)
@@ -593,7 +678,9 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
     # LayerWiseDistributedOptimizer save optimizer state to file on different ranks
     if getattr(args, "use_layer_wise_distributed_optimizer", False) and args.ckpt_format == 'torch':
         dp_rank = mpu.get_data_parallel_rank()
-        optim_checkpoint_name = os.path.join(os.path.dirname(checkpoint_name), f"layer_wise_optimizer_{dp_rank}.pt")
+        optim_checkpoint_name = os.path.join(
+            os.path.dirname(checkpoint_name), f"layer_wise_optimizer_{dp_rank}.pt"
+        )
         ensure_directory_exists(optim_checkpoint_name)
         if not optimizer.is_stub_optimizer:
             optimizer.save_state_dict_to_file(optim_checkpoint_name)
@@ -601,21 +688,33 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
     async_save_request = None
     if args.async_save:
         if ckpt_type == CheckpointType.LEGACY:
-            raise NotImplementedError('Async checkpoint save not implemented for legacy checkpoints')
-        elif ckpt_type == CheckpointType.GLOBAL and args.ckpt_format not in ['torch_dist', 'torch_dcp', 'fsdp_dtensor']:
-            raise NotImplementedError(f'Async checkpoint save not implemented for {args.ckpt_format} distributed checkpoint format')
+            raise NotImplementedError(
+                'Async checkpoint save not implemented for legacy checkpoints'
+            )
+        elif ckpt_type == CheckpointType.GLOBAL and args.ckpt_format not in [
+            'torch_dist',
+            'torch_dcp',
+            'fsdp_dtensor',
+        ]:
+            raise NotImplementedError(
+                f'Async checkpoint save not implemented for {args.ckpt_format} distributed checkpoint format'
+            )
 
     rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
 
     # Collect args, model, RNG.
-    if not torch.distributed.is_initialized() \
-            or mpu.get_expert_data_parallel_rank() == 0 \
-            or ckpt_type != CheckpointType.LEGACY:
+    if (
+        not torch.distributed.is_initialized()
+        or mpu.get_expert_data_parallel_rank() == 0
+        or ckpt_type != CheckpointType.LEGACY
+    ):
         if ckpt_type != CheckpointType.LEGACY:
             sharded_sd_metadata = _build_sharded_state_dict_metadata(args, dp_cp_group=dp_cp_group)
             if args.use_distributed_optimizer:
-                print_rank_0(f'Storing distributed optimizer sharded state of type'
-                             f' {sharded_sd_metadata["distrib_optim_sharding_type"]}')
+                print_rank_0(
+                    f'Storing distributed optimizer sharded state of type'
+                    f' {sharded_sd_metadata["distrib_optim_sharding_type"]}'
+                )
         else:
             sharded_sd_metadata = None
         state_dict = generate_state_dict(
@@ -650,35 +749,51 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
                         save_strategy.thread_count = args.dist_ckpt_workers
                     else:
                         # We don't allow per-rank parallel save for sync save
-                        logger.warning('Per-rank parallel save is not supported for sync save. '
-                                       'Setting args.dist_ckpt_workers to 1')
+                        logger.warning(
+                            'Per-rank parallel save is not supported for sync save. '
+                            'Setting args.dist_ckpt_workers to 1'
+                        )
                         save_strategy.thread_count = 1
-                    if checkpointing_context is not None and 'load_strategy' in checkpointing_context:
-                        cached_global_metadata = getattr(checkpointing_context['load_strategy'], 'cached_global_metadata', None)
+                    if (
+                        checkpointing_context is not None
+                        and 'load_strategy' in checkpointing_context
+                    ):
+                        cached_global_metadata = getattr(
+                            checkpointing_context['load_strategy'], 'cached_global_metadata', None
+                        )
                         if cached_global_metadata is not None:
                             logger.debug("Plugging in the read metadata from the load strategy...")
                             save_strategy.cached_global_metadata = cached_global_metadata
                         else:
-                            logger.debug("Failed to plug in the read metadata from the load strategy...")
+                            logger.debug(
+                                "Failed to plug in the read metadata from the load strategy..."
+                            )
 
                 if args.ckpt_fully_parallel_save:
                     if args.ckpt_fully_parallel_save_process_group == 'dp':
                         process_group = mpu.get_data_parallel_group(with_context_parallel=True)
                     elif args.ckpt_fully_parallel_save_process_group == 'ep_dp':
                         process_group = mpu.get_expert_data_parallel_group()
-                    save_strategy = FullyParallelSaveStrategyWrapper(save_strategy, process_group,
-                                                                     args.ckpt_assume_constant_structure)
+                    save_strategy = FullyParallelSaveStrategyWrapper(
+                        save_strategy, process_group, args.ckpt_assume_constant_structure
+                    )
             # Store save strategy for future checkpoint saves
             if checkpointing_context is not None:
                 checkpointing_context['save_strategy'] = save_strategy
             end_ckpt = time()
-            logger.debug(f"rank: {rank}, takes {end_ckpt - start_ckpt} to prepare state dict for ckpt ")
-            async_save_request = dist_checkpointing.save(state_dict, checkpoint_name, save_strategy,
-                                                         async_sharded_save=args.async_save,
-                                                         validate_access_integrity=validate_sharding_integrity,
-                                                         preprocess_common_before_consistancy_check=preprocess_common_state_dict_fn,
-                                                         content_metadata=_clean_metadata_for_serialization(sharded_sd_metadata),
-                                                         async_strategy=args.async_strategy)
+            logger.debug(
+                f"rank: {rank}, takes {end_ckpt - start_ckpt} to prepare state dict for ckpt "
+            )
+            async_save_request = dist_checkpointing.save(
+                state_dict,
+                checkpoint_name,
+                save_strategy,
+                async_sharded_save=args.async_save,
+                validate_access_integrity=validate_sharding_integrity,
+                preprocess_common_before_consistancy_check=preprocess_common_state_dict_fn,
+                content_metadata=_clean_metadata_for_serialization(sharded_sd_metadata),
+                async_strategy=args.async_strategy,
+            )
             # [ModelOpt]: save sharded modelopt_state
             if has_nvidia_modelopt:
                 save_sharded_modelopt_state(model, checkpoint_name, (args.ckpt_format, 1))
@@ -715,14 +830,20 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
                 )
 
                 save_state_dict_ret = save_state_dict_async_plan(
-                    state_dict, fs_storage_writer, None, coordinator_rank, planner=planner, enable_cache=args.ckpt_assume_constant_structure
+                    state_dict,
+                    fs_storage_writer,
+                    None,
+                    coordinator_rank,
+                    planner=planner,
+                    enable_cache=args.ckpt_assume_constant_structure,
+                )
+                async_save_request = get_save_and_finalize_callbacks(
+                    fs_storage_writer, save_state_dict_ret
                 )
-                async_save_request = get_save_and_finalize_callbacks(fs_storage_writer, save_state_dict_ret)
             else:
                 fs_storage_writer = torch.distributed.checkpoint.FileSystemWriter(checkpoint_name)
                 torch.distributed.checkpoint.save(
-                    state_dict=state_dict,
-                    storage_writer=fs_storage_writer,
+                    state_dict=state_dict, storage_writer=fs_storage_writer
                 )
         else:
             # [ModelOpt]: Inject modelopt_state into state_dict
@@ -733,16 +854,23 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
                     save_modelopt_state(model, state_dict)
 
             end_ckpt = time()
-            logger.debug(f"rank: {rank}, takes {end_ckpt - start_ckpt} to prepare state dict for ckpt ")
+            logger.debug(
+                f"rank: {rank}, takes {end_ckpt - start_ckpt} to prepare state dict for ckpt "
+            )
             if ckpt_type == CheckpointType.LOCAL:
                 try:
                     from megatron.core.dist_checkpointing.tensor_aware_state_dict import (
                         MCoreTensorAwareStateDict,
                     )
                 except ModuleNotFoundError:
-                    raise RuntimeError("The 'nvidia_resiliency_ext' module is required for local "
-                                       "checkpointing but was not found. Please ensure it is installed.")
-                if (sharded_sd_metadata or {}).get('distrib_optim_sharding_type') in ['fully_reshardable', 'dp_zero_gather_scatter']:
+                    raise RuntimeError(
+                        "The 'nvidia_resiliency_ext' module is required for local "
+                        "checkpointing but was not found. Please ensure it is installed."
+                    )
+                if (sharded_sd_metadata or {}).get('distrib_optim_sharding_type') in [
+                    'fully_reshardable',
+                    'dp_zero_gather_scatter',
+                ]:
                     # Note: Currently full reshardabilty is not supported when local checkpoints are used.
                     raise RuntimeError(
                         f"Local checkpointing does not support optimizer sharding type '{sharded_sd_metadata['distrib_optim_sharding_type']}'. "
@@ -750,11 +878,16 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
                     )
                 algo = args.non_persistent_local_ckpt_algo
                 cached_metadata = None
-                if args.ckpt_assume_constant_structure and 'local_checkpoint_cache' in checkpointing_context:
+                if (
+                    args.ckpt_assume_constant_structure
+                    and 'local_checkpoint_cache' in checkpointing_context
+                ):
                     cached_metadata = checkpointing_context['local_checkpoint_cache']
                 state_dict_for_save, cacheable_metadata = MCoreTensorAwareStateDict.from_state_dict(
-                    state_dict, algo=algo, cached_metadata=cached_metadata,
-                    parallelization_group=mpu.get_data_parallel_group(with_context_parallel=True)
+                    state_dict,
+                    algo=algo,
+                    cached_metadata=cached_metadata,
+                    parallelization_group=mpu.get_data_parallel_group(with_context_parallel=True),
                 )
                 async_save_request = checkpointing_context['local_checkpoint_manager'].save(
                     state_dict_for_save, iteration, is_async=bool(args.async_save)
@@ -774,45 +907,72 @@ def save_checkpoint(iteration, model, optimizer, opt_param_scheduler, num_floati
                 torch.distributed.barrier()
 
     # And update the latest iteration
-    if not torch.distributed.is_initialized() \
-            or torch.distributed.get_rank() == 0:
+    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
         tracker_filename = get_checkpoint_tracker_filename(save_dir)
 
         if ckpt_type == CheckpointType.LOCAL:
+
             def iter_finalize_fn():
-                print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] successfully "
-                             f"saved local checkpoint from iteration {iteration:7d}")
+                print_rank_0(
+                    f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] successfully "
+                    f"saved local checkpoint from iteration {iteration:7d}"
+                )
                 if args.log_progress and args.async_save:
-                    append_to_progress_log(f'Saved async local checkpoint\tIteration: {iteration}',
-                                           barrier=False)
+                    append_to_progress_log(
+                        f'Saved async local checkpoint\tIteration: {iteration}', barrier=False
+                    )
+
         else:
+
             def iter_finalize_fn():
                 prev_iteration = 0
-                save_retain_interval = getattr(args, 'save_retain_interval', None)  # For backwards compatibility of tests.
+                save_retain_interval = getattr(
+                    args, 'save_retain_interval', None
+                )  # For backwards compatibility of tests.
                 if save_retain_interval is not None:
-                    if os.path.exists(tracker_filename):  # TODO: Make this work with MSC remote paths?
+                    if os.path.exists(
+                        tracker_filename
+                    ):  # TODO: Make this work with MSC remote paths?
                         with open_file(tracker_filename, 'r') as f:
                             prev_iteration = int(f.read().strip())
                 with open_file(tracker_filename, 'w') as f:
                     f.write("release" if release else str(iteration))
-                tensor_rank_to_print = (tensor_rank if tensor_rank is not None else mpu.get_tensor_model_parallel_rank()) + 1
-                pipeline_rank_to_print = (pipeline_rank if pipeline_rank is not None else mpu.get_pipeline_model_parallel_rank()) + 1
-                print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] successfully saved "
-                             f"checkpoint from iteration {int(iteration):7d} to {args.save} "
-                             f"[ t {tensor_rank_to_print}/{mpu.get_tensor_model_parallel_world_size()}, "
-                             f"p {pipeline_rank_to_print}/{mpu.get_pipeline_model_parallel_world_size()} ]")
+                tensor_rank_to_print = (
+                    tensor_rank if tensor_rank is not None else mpu.get_tensor_model_parallel_rank()
+                ) + 1
+                pipeline_rank_to_print = (
+                    pipeline_rank
+                    if pipeline_rank is not None
+                    else mpu.get_pipeline_model_parallel_rank()
+                ) + 1
+                print_rank_0(
+                    f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] successfully saved "
+                    f"checkpoint from iteration {int(iteration):7d} to {args.save} "
+                    f"[ t {tensor_rank_to_print}/{mpu.get_tensor_model_parallel_world_size()}, "
+                    f"p {pipeline_rank_to_print}/{mpu.get_pipeline_model_parallel_world_size()} ]"
+                )
                 if args.log_progress and args.async_save:
-                    append_to_progress_log(f'Saved async checkpoint\tIteration: {iteration}',
-                                           barrier=False)
+                    append_to_progress_log(
+                        f'Saved async checkpoint\tIteration: {iteration}', barrier=False
+                    )
 
                 if save_retain_interval is not None:
-                    if prev_iteration > 0 and prev_iteration != iteration and prev_iteration % save_retain_interval != 0:
-                        checkpoint_name = get_checkpoint_name(args.save, iteration=prev_iteration,
-                                                              return_base_dir=True)
+                    if (
+                        prev_iteration > 0
+                        and prev_iteration != iteration
+                        and prev_iteration % save_retain_interval != 0
+                    ):
+                        checkpoint_name = get_checkpoint_name(
+                            args.save, iteration=prev_iteration, return_base_dir=True
+                        )
                         # Don't delete if `checkpoint_name` is a symbolic link.
-                        if os.path.islink(checkpoint_name):  # TODO: Make this work with MSC remote paths?
-                            print_rank_0(f'  skipping deleting checkpoint from iteration {prev_iteration:7d} '
-                                         f'at {args.save} since it is a symbolic link')
+                        if os.path.islink(
+                            checkpoint_name
+                        ):  # TODO: Make this work with MSC remote paths?
+                            print_rank_0(
+                                f'  skipping deleting checkpoint from iteration {prev_iteration:7d} '
+                                f'at {args.save} since it is a symbolic link'
+                            )
                         else:
                             # Asynchronous version of delete_checkpoint(args, iteration_to_delete=prev_iteration).
                             # Use multiprocessing to delete checkpoint in background
@@ -822,15 +982,24 @@ def iter_finalize_fn():
                                 ctx = multiprocessing.get_context('fork')
                                 delete_process = ctx.Process(
                                     target=_async_delete_checkpoint_impl,
-                                    args=(args.save, prev_iteration, args.log_progress, True,
-                                          args.async_ckpt_cpu_priority, args.async_ckpt_io_priority),
-                                    daemon=True
+                                    args=(
+                                        args.save,
+                                        prev_iteration,
+                                        args.log_progress,
+                                        True,
+                                        args.async_ckpt_cpu_priority,
+                                        args.async_ckpt_io_priority,
+                                    ),
+                                    daemon=True,
                                 )
                                 delete_process.start()
                                 # Track the process so we can join it later to prevent zombies
                                 _deletion_processes.append(delete_process)
                             else:
-                                th = threading.Thread(target=_async_delete_checkpoint_impl, args=(args.save, prev_iteration, args.log_progress))
+                                th = threading.Thread(
+                                    target=_async_delete_checkpoint_impl,
+                                    args=(args.save, prev_iteration, args.log_progress),
+                                )
                                 th.start()
 
         if args.async_save:
@@ -840,10 +1009,11 @@ def iter_finalize_fn():
             iter_finalize_fn()
 
     # Additional callback for one_logger (last rank)
-    if not torch.distributed.is_initialized() \
-       or is_last_rank():
+    if not torch.distributed.is_initialized() or is_last_rank():
+
         def onelogger_finalize_fn():
             on_save_checkpoint_success(productive_metrics, args.async_save)
+
         if args.async_save:
             assert async_save_request is not None
             async_save_request.add_finalize_fn(onelogger_finalize_fn)
@@ -851,10 +1021,13 @@ def onelogger_finalize_fn():
             onelogger_finalize_fn()
 
     # Additional callback for wandb (last rank)
-    if not torch.distributed.is_initialized() \
-       or is_last_rank():
+    if not torch.distributed.is_initialized() or is_last_rank():
+
         def wandb_finalize_fn():
-            wandb_utils.on_save_checkpoint_success(checkpoint_name, get_checkpoint_tracker_filename(save_dir), save_dir, iteration)
+            wandb_utils.on_save_checkpoint_success(
+                checkpoint_name, get_checkpoint_tracker_filename(save_dir), save_dir, iteration
+            )
+
         if args.async_save:
             assert async_save_request is not None
             async_save_request.add_finalize_fn(wandb_finalize_fn)
@@ -863,8 +1036,10 @@ def wandb_finalize_fn():
 
     if args.async_save:
         schedule_async_save(async_save_request)
-        print_rank_0(f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] scheduled "
-                     f"an async checkpoint save at iteration {iteration:7d} to {save_dir}")
+        print_rank_0(
+            f"  [{datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')}] scheduled "
+            f"an async checkpoint save at iteration {iteration:7d} to {save_dir}"
+        )
 
     # Wait so everyone is done (not necessary)
     if torch.distributed.is_initialized():
@@ -875,15 +1050,22 @@ def wandb_finalize_fn():
 
     ft_integration.on_checkpointing_end(is_async_finalization=False)
 
+
 @_disable_gc()
-def _async_delete_checkpoint_impl(save_path, iteration_to_delete, log_progress=False, lower_priority=False,
-                                  cpu_priority=None, io_priority=None):
+def _async_delete_checkpoint_impl(
+    save_path,
+    iteration_to_delete,
+    log_progress=False,
+    lower_priority=False,
+    cpu_priority=None,
+    io_priority=None,
+):
     """Module-level function for async checkpoint deletion.
-    
+
     This function can be pickled and executed by the async worker process.
     Note: This is only called from rank 0, so we use regular print() instead of print_rank_0()
     since torch.distributed won't be initialized in the async worker process.
-    
+
     Args:
         save_path (str): Path to the checkpoints directory
         iteration_to_delete (int): Iteration number of checkpoint to delete
@@ -894,19 +1076,29 @@ def _async_delete_checkpoint_impl(save_path, iteration_to_delete, log_progress=F
     """
     if lower_priority:
         from megatron.core.dist_checkpointing.strategies.async_utils import _set_process_qos
+
         _set_process_qos(cpu_priority=cpu_priority, io_priority=io_priority)
 
-    checkpoint_name = get_checkpoint_name(save_path, iteration=iteration_to_delete,
-                                         return_base_dir=True)
+    checkpoint_name = get_checkpoint_name(
+        save_path, iteration=iteration_to_delete, return_base_dir=True
+    )
     try:
         shutil.rmtree(checkpoint_name)  # TODO: Make this work with MSC remote paths?
-        print(f'  successfully deleted checkpoint from iteration {iteration_to_delete:7d} '
-              f'at {save_path}', flush=True)
+        print(
+            f'  successfully deleted checkpoint from iteration {iteration_to_delete:7d} '
+            f'at {save_path}',
+            flush=True,
+        )
         if log_progress:
-            append_to_progress_log(f'Deleted checkpoint\tIteration: {iteration_to_delete}', barrier=False)
+            append_to_progress_log(
+                f'Deleted checkpoint\tIteration: {iteration_to_delete}', barrier=False
+            )
     except Exception as e:
-        print(f'  encountered exception "{e}" when trying to delete checkpoint from '
-              f'iteration {iteration_to_delete:7d} at {save_path}', flush=True)
+        print(
+            f'  encountered exception "{e}" when trying to delete checkpoint from '
+            f'iteration {iteration_to_delete:7d} at {save_path}',
+            flush=True,
+        )
         # Any exception encountered in checkpoint deletion can be ignored and is not fatal.
         pass
 
@@ -918,7 +1110,9 @@ def cleanup_old_non_persistent_checkpoint(save_dir, leave_ckpt_num=1, do_async=F
 
     iter_prefix = "iter_"
     iter_ckpts = save_dir.rglob(f'{iter_prefix}*')
-    sorted_iter_ckpts = sorted(iter_ckpts, key=lambda ckpt_name: int(ckpt_name.name[len(iter_prefix):]))
+    sorted_iter_ckpts = sorted(
+        iter_ckpts, key=lambda ckpt_name: int(ckpt_name.name[len(iter_prefix) :])
+    )
     if not sorted_iter_ckpts:
         return
     rm_iter_ckpts = sorted_iter_ckpts[:-leave_ckpt_num]
@@ -928,6 +1122,7 @@ def cleanup_old_non_persistent_checkpoint(save_dir, leave_ckpt_num=1, do_async=F
     def remove_iter_ckpts(_iter_ckpts):
         for ckpt in _iter_ckpts:
             shutil.rmtree(ckpt)
+
     if do_async:
         threading.Thread(target=remove_iter_ckpts, args=(rm_iter_ckpts,)).start()
     else:
@@ -955,10 +1150,15 @@ def maybe_save_dataloader_state(train_iterator, iteration, dataloader_save_path)
 
     # If dataloader doesn't support saving state, raise an error.
     if not hasattr(train_iterator.iterable, "save_state"):
-        raise RuntimeError(f"Could not find a save_state for the train_iterator of type {type(train_iterator)}")
+        raise RuntimeError(
+            f"Could not find a save_state for the train_iterator of type {type(train_iterator)}"
+        )
 
     # Save dataloader state for each data parallel rank only once.
-    first_rank = mpu.is_pipeline_first_stage(ignore_virtual=True) and mpu.get_tensor_model_parallel_rank() == 0
+    first_rank = (
+        mpu.is_pipeline_first_stage(ignore_virtual=True)
+        and mpu.get_tensor_model_parallel_rank() == 0
+    )
     if not first_rank:
         return
 
@@ -967,8 +1167,7 @@ def maybe_save_dataloader_state(train_iterator, iteration, dataloader_save_path)
         print(f"saving dataloader checkpoint at iteration {iteration} to {dataloader_save_path}")
     train_dataloader_state_dict = train_iterator.iterable.save_state()
     data_state_save_path = get_checkpoint_name(
-        dataloader_save_path, iteration,
-        basename=f'train_dataloader_dprank{dp_rank:03d}.pt'
+        dataloader_save_path, iteration, basename=f'train_dataloader_dprank{dp_rank:03d}.pt'
     )
 
     torch.distributed.barrier(group=mpu.get_data_parallel_group())
@@ -994,7 +1193,7 @@ def generate_state_dict(
     model_sd_kwargs=None,
     rerun_state=None,
 ):
-    """Generate a state dict from given model, optimizer, scheduler, rng state and others. """
+    """Generate a state dict from given model, optimizer, scheduler, rng state and others."""
 
     # Arguments, iteration, and model.
     state_dict = {}
@@ -1010,13 +1209,16 @@ def generate_state_dict(
 
         if args.ckpt_format == "torch_dist":
             model_sd = model[i].sharded_state_dict(
-                **(model_sd_kwargs or {
-                    "metadata": {
-                        "dp_cp_group": mpu.get_data_parallel_group(with_context_parallel=True)
+                **(
+                    model_sd_kwargs
+                    or {
+                        "metadata": {
+                            "dp_cp_group": mpu.get_data_parallel_group(with_context_parallel=True)
+                        }
                     }
-                })
+                )
             )
-        else:   # torch, torch_dcp, fsdp_dtensor
+        else:  # torch, torch_dcp, fsdp_dtensor
             model_sd = model[i].state_dict_for_save_checkpoint()
 
         state_dict[key] = model_sd
@@ -1028,11 +1230,16 @@ def generate_state_dict(
             if args.ckpt_format == "torch_dist":
                 optimizer_sd = optimizer.sharded_state_dict(
                     state_dict,
-                    **(optim_sd_kwargs or {
-                        "metadata": {
-                            "dp_cp_group": mpu.get_data_parallel_group(with_context_parallel=True)
+                    **(
+                        optim_sd_kwargs
+                        or {
+                            "metadata": {
+                                "dp_cp_group": mpu.get_data_parallel_group(
+                                    with_context_parallel=True
+                                )
+                            }
                         }
-                    })
+                    ),
                 )
             elif args.ckpt_format == "fsdp_dtensor":
                 if optim_sd_kwargs is None:
@@ -1047,8 +1254,7 @@ def generate_state_dict(
             state_dict['optimizer'] = optimizer_sd
 
         if opt_param_scheduler is not None:
-            state_dict['opt_param_scheduler'] = \
-                opt_param_scheduler.state_dict()
+            state_dict['opt_param_scheduler'] = opt_param_scheduler.state_dict()
 
     # Rerun state
     if rerun_state:
@@ -1072,9 +1278,7 @@ def preprocess_fsdp_dtensor_state_dict(args, raw_state_dict, model):
             state_dict["model"] = model_state_dict
             state_dict["optimizer"] = optimizer_state_dict
         else:
-            model_state_dict, _ = handle_swiglu_in_state_dict(
-                model, state_dict["model"], None
-            )
+            model_state_dict, _ = handle_swiglu_in_state_dict(model, state_dict["model"], None)
             state_dict["model"] = model_state_dict
     if args.num_experts:
         state_dict["model"] = handle_experts_in_state_dict(state_dict["model"], args.num_experts)
@@ -1096,11 +1300,13 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):
         """[num_splits * np * hn, h]
         -->(view) [num_splits, np, hn, h]
         -->(tranpose) [np, num_splits, hn, h]
-        -->(view) [np * num_splits * hn, h] """
+        -->(view) [np * num_splits * hn, h]"""
 
-        intermediate_shape = \
-            (num_splits, num_attention_heads_per_partition,
-             hidden_size_per_attention_head) + input_shape[1:]
+        intermediate_shape = (
+            num_splits,
+            num_attention_heads_per_partition,
+            hidden_size_per_attention_head,
+        ) + input_shape[1:]
 
         t = t.view(*intermediate_shape)
         t = t.transpose(0, 1).contiguous()
@@ -1108,12 +1314,13 @@ def _transpose_first_dim(t, num_splits, num_splits_first, model):
         """[np * hn * num_splits, h]
         -->(view) [np, hn, num_splits, h]
         -->(tranpose) [np, num_splits, hn, h]
-        -->(view) [np * num_splits * hn, h] """
+        -->(view) [np * num_splits * hn, h]"""
 
-        intermediate_shape = \
-            (num_attention_heads_per_partition,
-             hidden_size_per_attention_head, num_splits) +\
-             input_shape[1:]
+        intermediate_shape = (
+            num_attention_heads_per_partition,
+            hidden_size_per_attention_head,
+            num_splits,
+        ) + input_shape[1:]
 
         t = t.view(*intermediate_shape)
         t = t.transpose(1, 2).contiguous()
@@ -1128,7 +1335,7 @@ def fix_query_key_value_ordering(model, checkpoint_version):
     """
     if checkpoint_version < 2.0:
         if isinstance(model, list):
-            assert len(model)==1
+            assert len(model) == 1
             model = model[0]
         for name, param in model.named_parameters():
             if name.endswith(('.query_key_value.weight', '.query_key_value.bias')):
@@ -1149,8 +1356,10 @@ def fix_query_key_value_ordering(model, checkpoint_version):
                     print_rank_0(f"Invalid checkpoint version {checkpoint_version}.")
                     sys.exit()
                 param.data.copy_(fixed_param)
-        print_rank_0(" successfully fixed query-key-values ordering for"
-                     " checkpoint version {}".format(checkpoint_version))
+        print_rank_0(
+            " successfully fixed query-key-values ordering for"
+            " checkpoint version {}".format(checkpoint_version)
+        )
 
 
 def _get_non_persistent_iteration(non_persistent_global_dir, args, checkpointing_context=None):
@@ -1170,8 +1379,10 @@ def _get_non_persistent_iteration(non_persistent_global_dir, args, checkpointing
     elif args.non_persistent_ckpt_type == "local":
         return checkpointing_context['local_checkpoint_manager'].find_latest()
     else:
-        assert False, 'Please use local or global non-persistent checkpoints' \
+        assert False, (
+            'Please use local or global non-persistent checkpoints'
             f'(got: {args.non_persistent_ckpt_type})'
+        )
 
 
 def _load_non_persistent_base_checkpoint(
@@ -1182,7 +1393,7 @@ def _load_non_persistent_base_checkpoint(
     non_persistent_iteration,
     checkpointing_context=None,
 ):
-    """ Load the base state_dict from a non-persistent distributed checkpoint.
+    """Load the base state_dict from a non-persistent distributed checkpoint.
     Depending on the non_persistent_ckpt_type, different logic may be required.
     """
     assert args.non_persistent_ckpt_type is not None
@@ -1192,8 +1403,13 @@ def _load_non_persistent_base_checkpoint(
                 f'Loading from a non-persistent checkpoint (non-persistent iter {non_persistent_iteration})'
             )
         return _load_global_dist_base_checkpoint(
-            non_persistent_global_dir, args, rank0, sharded_state_dict, non_persistent_iteration, False,
-            checkpointing_context=checkpointing_context
+            non_persistent_global_dir,
+            args,
+            rank0,
+            sharded_state_dict,
+            non_persistent_iteration,
+            False,
+            checkpointing_context=checkpointing_context,
         )
     elif args.non_persistent_ckpt_type == "local":
         intermediate_state_dict, checkpoint_name = checkpointing_context[
@@ -1202,17 +1418,19 @@ def _load_non_persistent_base_checkpoint(
         state_dict = intermediate_state_dict.to_state_dict(
             sharded_state_dict,
             algo=args.non_persistent_local_ckpt_algo,
-            parallelization_group = mpu.get_data_parallel_group(with_context_parallel=True)
+            parallelization_group=mpu.get_data_parallel_group(with_context_parallel=True),
         )
         return state_dict, checkpoint_name, False, CheckpointType.LOCAL
     else:
-        raise NotImplementedError(f"Please use local or global non-persistent checkpoints (got: {args.non_persistent_ckpt_type})")
+        raise NotImplementedError(
+            f"Please use local or global non-persistent checkpoints (got: {args.non_persistent_ckpt_type})"
+        )
 
 
 def _load_global_dist_base_checkpoint(
     load_dir, args, rank0, sharded_state_dict, iteration, release, checkpointing_context=None
 ):
-    """ Load the base state_dict from the given directory containing the global distributed checkpoint """
+    """Load the base state_dict from the given directory containing the global distributed checkpoint"""
     if rank0:
         checkpoint_name = find_checkpoint_rank_0(load_dir, iteration, release)
         state_dict = dist_checkpointing.load_common_state_dict(checkpoint_name)
@@ -1236,7 +1454,9 @@ def _load_global_dist_base_checkpoint(
         elif args.ckpt_fully_parallel_load_process_group == 'ep_dp':
             process_group = mpu.get_expert_data_parallel_group()
         else:
-            raise ValueError(f"Invalid load process group: {args.ckpt_fully_parallel_load_process_group}")
+            raise ValueError(
+                f"Invalid load process group: {args.ckpt_fully_parallel_load_process_group}"
+            )
 
         load_strategy = FullyParallelLoadStrategyWrapper(
             load_strategy, process_group, exchange_algo=args.ckpt_fully_parallel_load_exchange_algo
@@ -1244,10 +1464,7 @@ def _load_global_dist_base_checkpoint(
     if checkpointing_context is not None:
         checkpointing_context["load_strategy"] = load_strategy
     state_dict = dist_checkpointing.load(
-        sharded_state_dict,
-        checkpoint_name,
-        load_strategy,
-        strict=args.dist_ckpt_strictness,
+        sharded_state_dict, checkpoint_name, load_strategy, strict=args.dist_ckpt_strictness
     )
     return state_dict, checkpoint_name, release, CheckpointType.GLOBAL
 
@@ -1279,13 +1496,9 @@ def _get_checkpoint_format(checkpoint_name, args):
 
 
 def _load_base_checkpoint(
-    load_dir,
-    args,
-    rank0=False,
-    sharded_state_dict=None,
-    checkpointing_context=None,
+    load_dir, args, rank0=False, sharded_state_dict=None, checkpointing_context=None
 ):
-    """ Load the base state_dict from the given directory
+    """Load the base state_dict from the given directory
 
     If rank0 is true, just loads rank 0 checkpoint, ignoring arguments.
     """
@@ -1359,7 +1572,13 @@ def _load_base_checkpoint(
     # Handle global distributed checkpoint
     if ckpt_format == "torch_dist":
         return _load_global_dist_base_checkpoint(
-            load_dir, args, rank0, sharded_state_dict, iteration, release, checkpointing_context=checkpointing_context
+            load_dir,
+            args,
+            rank0,
+            sharded_state_dict,
+            iteration,
+            release,
+            checkpointing_context=checkpointing_context,
         )
     elif ckpt_format == "torch":
         ckpt_type = CheckpointType.LEGACY
@@ -1367,7 +1586,9 @@ def _load_base_checkpoint(
         if rank0:
             checkpoint_name = find_checkpoint_rank_0(load_dir, iteration, release)
         else:
-            checkpoint_name = get_checkpoint_name(load_dir, iteration, release, return_base_dir=False)
+            checkpoint_name = get_checkpoint_name(
+                load_dir, iteration, release, return_base_dir=False
+            )
         try:
             state_dict = torch.load(checkpoint_name, map_location='cpu')
         except ModuleNotFoundError:
@@ -1376,7 +1597,9 @@ def _load_base_checkpoint(
             # For backward compatibility.
             if not rank0:
                 print_rank_0(' > deserializing using the old code structure ...')
-            sys.modules['fp16.loss_scaler'] = sys.modules['megatron.legacy.fp16_deprecated.loss_scaler']
+            sys.modules['fp16.loss_scaler'] = sys.modules[
+                'megatron.legacy.fp16_deprecated.loss_scaler'
+            ]
             sys.modules['megatron.fp16.loss_scaler'] = sys.modules[
                 'megatron.legacy.fp16_deprecated.loss_scaler'
             ]
@@ -1396,10 +1619,7 @@ def _load_base_checkpoint(
             # _load_base_checkpoint is called from load_args_from_checkpoint. torch.distributed is not initialized.
             # Load only metadata.
             state_dict = {"args": None, "iteration": None}
-            torch.distributed.checkpoint.load(
-                state_dict=state_dict,
-                checkpoint_id=checkpoint_name,
-            )
+            torch.distributed.checkpoint.load(state_dict=state_dict, checkpoint_id=checkpoint_name)
         else:
             # _load_base_checkpoint is called from load_checkpoint with a proper state dict.
             state_dict = sharded_state_dict
@@ -1407,8 +1627,7 @@ def _load_base_checkpoint(
             fs_storage_reader = torch.distributed.checkpoint.FileSystemReader(checkpoint_name)
 
             torch.distributed.checkpoint.load_state_dict(
-                state_dict=state_dict,
-                storage_reader=fs_storage_reader,
+                state_dict=state_dict, storage_reader=fs_storage_reader
             )
     elif ckpt_format == "fsdp_dtensor":
         assert HAVE_MEGATRON_FSDP, "Should not be called if Megatron-FSDP is not available."
@@ -1416,7 +1635,9 @@ def _load_base_checkpoint(
             return {}, checkpoint_name, release, CheckpointType.FSDP_DTENSOR
 
         state_dict = sharded_state_dict
-        raw_optimizer_state_dict = state_dict["optimizer"].copy() if "optimizer" in state_dict else None
+        raw_optimizer_state_dict = (
+            state_dict["optimizer"].copy() if "optimizer" in state_dict else None
+        )
         raw_model_state_dict = state_dict["model"].copy() if "model" in state_dict else None
         model = state_dict.pop("_model")
         state_dict = preprocess_fsdp_dtensor_state_dict(args, state_dict, model[0])
@@ -1428,14 +1649,13 @@ def _load_base_checkpoint(
             state_dict_metadata = fs_storage_reader.read_metadata().state_dict_metadata
             rank = torch.distributed.get_rank()
             import time as _time
+
             _time.sleep(rank * 0.001)  # Make that logs of different ranks do not overlap
             print_diff_in_state_dicts(state_dict_metadata, state_dict)
 
         planner = default_planner.DefaultLoadPlanner(allow_partial_load=allow_partial_load)
         torch.distributed.checkpoint.load_state_dict(
-            state_dict=state_dict,
-            storage_reader=fs_storage_reader,
-            planner=planner,
+            state_dict=state_dict, storage_reader=fs_storage_reader, planner=planner
         )
 
         if raw_optimizer_state_dict is not None:
@@ -1449,9 +1669,7 @@ def _load_base_checkpoint(
     return state_dict, checkpoint_name, release, ckpt_type
 
 
-def load_args_from_checkpoint(
-    args, load_arg='load', checkpointing_context=None
-):
+def load_args_from_checkpoint(args, load_arg='load', checkpointing_context=None):
     """Set required arguments from the checkpoint specified in the
     arguments.
 
@@ -1471,10 +1689,7 @@ def load_args_from_checkpoint(
         return args
 
     state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint(
-        load_dir,
-        args,
-        rank0=True,
-        checkpointing_context=checkpointing_context,
+        load_dir, args, rank0=True, checkpointing_context=checkpointing_context
     )
 
     # Args.
@@ -1497,10 +1712,13 @@ def load_args_from_checkpoint(
         )
 
     # Backward compat: old checkpoints have hybrid_override_pattern but not hybrid_layer_pattern
-    if (getattr(checkpoint_args, 'hybrid_override_pattern', None) is not None
-            and getattr(checkpoint_args, 'hybrid_layer_pattern', None) is None):
+    if (
+        getattr(checkpoint_args, 'hybrid_override_pattern', None) is not None
+        and getattr(checkpoint_args, 'hybrid_layer_pattern', None) is None
+    ):
         setattr(
-            checkpoint_args, 'hybrid_layer_pattern',
+            checkpoint_args,
+            'hybrid_layer_pattern',
             getattr(checkpoint_args, 'hybrid_override_pattern'),
         )
         # num_layers is now derived from hybrid_layer_pattern in validate_args, and should not be
@@ -1568,6 +1786,8 @@ def _set_arg(arg_name, old_arg_name=None, force=False):
     _set_arg('moe_token_dispatcher_type', force=False)
     _set_arg('moe_router_pre_softmax', force=True)
     _set_arg('moe_grouped_gemm', force=True)
+    _set_arg('moe_single_grouped_weight', force=True)
+    _set_arg('moe_single_grouped_bias', force=True)
     _set_arg('moe_shared_expert_intermediate_size', force=True)
     _set_arg('moe_router_score_function', force=True)
     _set_arg('moe_router_enable_expert_bias', force=True)
@@ -1614,8 +1834,18 @@ def _set_arg(arg_name, old_arg_name=None, force=False):
     return args, checkpoint_args
 
 
-def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load', strict=True,
-                    checkpointing_context=None, skip_load_to_model_and_opt=False, tp_group: Optional[torch.distributed.ProcessGroup] = None, pp_group: Optional[torch.distributed.ProcessGroup] = None, dp_cp_group: Optional[torch.distributed.ProcessGroup] = None):
+def load_checkpoint(
+    ddp_model,
+    optimizer,
+    opt_param_scheduler,
+    load_arg='load',
+    strict=True,
+    checkpointing_context=None,
+    skip_load_to_model_and_opt=False,
+    tp_group: Optional[torch.distributed.ProcessGroup] = None,
+    pp_group: Optional[torch.distributed.ProcessGroup] = None,
+    dp_cp_group: Optional[torch.distributed.ProcessGroup] = None,
+):
     """Load a model checkpoint and return the iteration.
     strict (bool): whether to strictly enforce that the keys in
         :attr:`state_dict` of the checkpoint match the names of
@@ -1644,10 +1874,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
     ckpt_format = args.ckpt_format
     if args.auto_detect_ckpt_format or ckpt_format == "torch_dist":
         state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint(
-            load_dir,
-            args,
-            rank0=True,
-            checkpointing_context=checkpointing_context,
+            load_dir, args, rank0=True, checkpointing_context=checkpointing_context
         )
 
         ckpt_format = None
@@ -1660,7 +1887,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
         elif ckpt_type in [CheckpointType.LOCAL, CheckpointType.GLOBAL]:
             ckpt_format = "torch_dist"
         elif ckpt_type == None:
-            pass    # Not loaded.
+            pass  # Not loaded.
         else:
             raise NotImplementedError(f"checkpoint format {ckpt_format} not supported")
 
@@ -1681,10 +1908,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
             getattr(ckpt_args, "tensor_model_parallel_size", 1),
             getattr(ckpt_args, "pipeline_model_parallel_size", 1),
         )
-        run_tp_pp = (
-            args.tensor_model_parallel_size,
-            args.pipeline_model_parallel_size,
-        )
+        run_tp_pp = (args.tensor_model_parallel_size, args.pipeline_model_parallel_size)
 
         ckpt_world_size = getattr(ckpt_args, 'world_size', 0)
         run_world_size = getattr(args, 'world_size', 0)
@@ -1695,12 +1919,19 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
         )
 
         # Determine if RNG state will be loaded
-        if (ckpt_tp_pp == run_tp_pp and not release and not args.finetune and not args.no_load_rng
-                and not getattr(ckpt_args, 'no_save_rng', False)):
+        if (
+            ckpt_tp_pp == run_tp_pp
+            and not release
+            and not args.finetune
+            and not args.no_load_rng
+            and not getattr(ckpt_args, 'no_save_rng', False)
+        ):
             if tp_group is None and pp_group is None:
                 tp_group = mpu.get_tensor_model_parallel_group()
                 pp_group = mpu.get_pipeline_model_parallel_group()
-            gen_sd_rng_state = get_rng_state(args.ckpt_format, tp_group, pp_group)  # we can load the rng state
+            gen_sd_rng_state = get_rng_state(
+                args.ckpt_format, tp_group, pp_group
+            )  # we can load the rng state
         else:
             ignore_rng_state = True
             gen_sd_rng_state = None
@@ -1710,12 +1941,20 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
         if ckpt_type == CheckpointType.LOCAL:
             sharded_sd_metadata = _build_sharded_state_dict_metadata(args)
         else:
-            sharded_sd_metadata = dist_checkpointing.load_content_metadata(preloaded_state_dict=state_dict)
-        print_rank_0(f'sharded_state_dict metadata loaded from the checkpoint: {sharded_sd_metadata}')
+            sharded_sd_metadata = dist_checkpointing.load_content_metadata(
+                preloaded_state_dict=state_dict
+            )
+        print_rank_0(
+            f'sharded_state_dict metadata loaded from the checkpoint: {sharded_sd_metadata}'
+        )
 
         # Determine if optimizer state will be loaded
-        if (not release and not args.finetune and not args.no_load_optim
-                and not getattr(ckpt_args, 'no_save_optim', False)):
+        if (
+            not release
+            and not args.finetune
+            and not args.no_load_optim
+            and not getattr(ckpt_args, 'no_save_optim', False)
+        ):
             gen_sd_optim = optimizer
             gen_sd_opt_param_scheduler = opt_param_scheduler
 
@@ -1725,23 +1964,33 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
                     # Can be removed after ending support for MLM optimizer checkpoints with MCore < v0.13
                     # (for MCore v0.13+ checkpoints `sharded_sd_metadata is not None`)
                     sharded_sd_metadata = {
-                        'distrib_optim_sharding_type': ('fully_sharded_model_space'
-                                                        if getattr(ckpt_args, 'ckpt_fully_parallel_save', False)
-                                                        else 'dp_zero_gather_scatter'),
+                        'distrib_optim_sharding_type': (
+                            'fully_sharded_model_space'
+                            if getattr(ckpt_args, 'ckpt_fully_parallel_save', False)
+                            else 'dp_zero_gather_scatter'
+                        )
                     }
                 if (
                     ckpt_tp_pp != run_tp_pp
                     and sharded_sd_metadata['distrib_optim_sharding_type']
                     not in DistributedOptimizer.checkpoint_fully_reshardable_formats
                 ):
-                    raise RuntimeError(f"{mismatch_msg}: not supported for DistributedOptimizer with sharding type"
-                                       f" {sharded_sd_metadata['distrib_optim_sharding_type']}."
-                                       f" Please use `--ckpt-fully-parallel-save` flag during checkpoint saving.")
+                    raise RuntimeError(
+                        f"{mismatch_msg}: not supported for DistributedOptimizer with sharding type"
+                        f" {sharded_sd_metadata['distrib_optim_sharding_type']}."
+                        f" Please use `--ckpt-fully-parallel-save` flag during checkpoint saving."
+                    )
 
                 # Check if fully parallel load is compatible with sharding type
-                if args.ckpt_fully_parallel_load and sharded_sd_metadata['distrib_optim_sharding_type'] == 'dp_zero_gather_scatter':
-                    raise RuntimeError("Fully parallel load is not supported for dp_zero_gather_scatter checkpoints. "
-                                       "Please remove --ckpt-fully-parallel-load flag")
+                if (
+                    args.ckpt_fully_parallel_load
+                    and sharded_sd_metadata['distrib_optim_sharding_type']
+                    == 'dp_zero_gather_scatter'
+                ):
+                    raise RuntimeError(
+                        "Fully parallel load is not supported for dp_zero_gather_scatter checkpoints. "
+                        "Please remove --ckpt-fully-parallel-load flag"
+                    )
         else:
             gen_sd_optim = None
             gen_sd_opt_param_scheduler = None
@@ -1771,27 +2020,30 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
             rerun_state_machine = get_rerun_state_machine()
             if rerun_state_machine.validate_state_dict(state_dict['rerun_state_machine']):
                 gen_sd_rerun_state = rerun_state_machine.state_dict(
-                    data_iterator=None, ckpt_format=ckpt_format, force=True,
+                    data_iterator=None, ckpt_format=ckpt_format, force=True
                 )
                 ignore_rerun_state = False
-        if (
-            ckpt_world_size != run_world_size
-            or ckpt_tp_pp != run_tp_pp
-            or ckpt_dp != run_dp
-        ):
+        if ckpt_world_size != run_world_size or ckpt_tp_pp != run_tp_pp or ckpt_dp != run_dp:
             print_rank_0("Job sharding has changed: Rerun state will be ignored")
 
         # [ModelOpt]: Initial loading from non-resume sharded checkpoint to a Distillation Model
         # will result in key mismatch with loss modules potentially containing parameters, since
         # it requires generating a state_dict before loading. Here we hide those modules if present.
-        with contextlib.ExitStack() as stack:  # Allows multiple context managers for each model shard
+        with (
+            contextlib.ExitStack() as stack
+        ):  # Allows multiple context managers for each model shard
             if args.finetune and hasattr(model[0], "hide_loss_modules"):
                 for m in model:
                     stack.enter_context(m.hide_loss_modules())
             load_kwargs['sharded_state_dict'] = generate_state_dict(
-                args, model, gen_sd_optim, gen_sd_opt_param_scheduler, gen_sd_rng_state,
-                optim_sd_kwargs=optim_sd_kwargs, model_sd_kwargs=model_sd_kwargs,
-                rerun_state=gen_sd_rerun_state
+                args,
+                model,
+                gen_sd_optim,
+                gen_sd_opt_param_scheduler,
+                gen_sd_rng_state,
+                optim_sd_kwargs=optim_sd_kwargs,
+                model_sd_kwargs=model_sd_kwargs,
+                rerun_state=gen_sd_rerun_state,
             )
     elif args.ckpt_format == "torch_dcp":
         model_sd = model[0].state_dict()
@@ -1824,7 +2076,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
         if not args.finetune:
             if "rerun_state_machine" in state_dict_metadata:
                 gen_sd_rerun_state = get_rerun_state_machine().state_dict(
-                    data_iterator=None, ckpt_format=ckpt_format, force=True,
+                    data_iterator=None, ckpt_format=ckpt_format, force=True
                 )
             if not args.no_load_rng:
                 gen_sd_rng_state = get_rng_state(args.ckpt_format, tp_group, pp_group)
@@ -1848,8 +2100,7 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
         load_kwargs["sharded_state_dict"] = state_dict
 
     state_dict, checkpoint_name, release, ckpt_type = _load_base_checkpoint(
-        load_dir, args, rank0=False, checkpointing_context=checkpointing_context,
-        **load_kwargs
+        load_dir, args, rank0=False, checkpointing_context=checkpointing_context, **load_kwargs
     )
 
     # Checkpoint not loaded.
@@ -1875,8 +2126,10 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
             try:  # Backward compatible with older checkpoints
                 iteration = state_dict['total_iters']
             except KeyError:
-                print_rank_0('A metadata file exists but unable to load '
-                             'iteration from checkpoint {}, exiting'.format(checkpoint_name))
+                print_rank_0(
+                    'A metadata file exists but unable to load '
+                    'iteration from checkpoint {}, exiting'.format(checkpoint_name)
+                )
                 sys.exit()
     num_floating_point_operations_so_far = state_dict.get('num_floating_point_operations_so_far', 0)
 
@@ -1884,13 +2137,10 @@ def load_checkpoint(ddp_model, optimizer, opt_param_scheduler, load_arg='load',
     if 'args' in state_dict and not args.finetune:
         checkpoint_args = state_dict['args']
         check_checkpoint_args(checkpoint_args)
-        args.consumed_train_samples = getattr(checkpoint_args,
-                                              'consumed_train_samples', 0)
-        args.skipped_train_samples = getattr(checkpoint_args,
-                                             'skipped_train_samples', 0)
+        args.consumed_train_samples = getattr(checkpoint_args, 'consumed_train_samples', 0)
+        args.skipped_train_samples = getattr(checkpoint_args, 'skipped_train_samples', 0)
         update_num_microbatches(consumed_samples=args.consumed_train_samples, verbose=True)
-        args.consumed_valid_samples = getattr(checkpoint_args,
-                                              'consumed_valid_samples', 0)
+        args.consumed_valid_samples = getattr(checkpoint_args, 'consumed_valid_samples', 0)
     else:
         print_rank_0('could not find arguments in the checkpoint ...')
 
@@ -1903,6 +2153,7 @@ def load_model_state_dict(module, state_dict, strict: bool):
                 # Fallback support for backward compatibility breaking changes in TransformerEngine
                 load_return = module.load_state_dict(state_dict, strict=False)
                 print(f"load_return: {load_return}")
+
     # Model.
     if not skip_load_to_model_and_opt:
         if len(ddp_model) == 1:
@@ -1923,42 +2174,58 @@ def load_model_state_dict(module, state_dict, strict: bool):
     if not release and not args.finetune and not args.no_load_optim:
         try:
             # Load state dict.
-            if getattr(args, "use_layer_wise_distributed_optimizer", False) and args.ckpt_format == 'torch':
+            if (
+                getattr(args, "use_layer_wise_distributed_optimizer", False)
+                and args.ckpt_format == 'torch'
+            ):
                 # LayerWiseDistributedOptimizer load optimizer state from file on different ranks
                 dp_rank = mpu.get_data_parallel_rank()
-                optim_checkpoint_name = os.path.join(os.path.dirname(checkpoint_name), f"layer_wise_optimizer_{dp_rank}.pt")
+                optim_checkpoint_name = os.path.join(
+                    os.path.dirname(checkpoint_name), f"layer_wise_optimizer_{dp_rank}.pt"
+                )
                 optimizer.load_state_dict_from_file(optim_checkpoint_name)
-            elif not skip_load_to_model_and_opt and optimizer is not None and not optimizer.is_stub_optimizer:
+            elif (
+                not skip_load_to_model_and_opt
+                and optimizer is not None
+                and not optimizer.is_stub_optimizer
+            ):
                 optimizer.load_state_dict(state_dict['optimizer'])
 
             # Load distributed optimizer's custom parameter state.
             # For distributed checkpoint it's already loaded in load_state_dict above
             is_torch_dist = ckpt_format == "torch_dist"
-            if args.use_distributed_optimizer and not is_torch_dist and ckpt_format not in ["torch_dcp", "fsdp_dtensor"]:
+            if (
+                args.use_distributed_optimizer
+                and not is_torch_dist
+                and ckpt_format not in ["torch_dcp", "fsdp_dtensor"]
+            ):
                 # NOTE: this is a manual read of the tracker file.
                 # This code should not be reached when reading from a non_persistent checkpoint
                 assert not is_torch_dist
                 tracker_filename = get_checkpoint_tracker_filename(load_dir)
                 iteration, release = read_metadata(tracker_filename)
-                model_checkpoint_name = \
-                    get_checkpoint_name(load_dir, iteration, release)
-                optim_checkpoint_name = \
-                    get_distributed_optimizer_checkpoint_name(
-                        model_checkpoint_name)
-                optimizer.load_parameter_state(optim_checkpoint_name,
-                                               update_legacy_format=args.ckpt_convert_update_legacy_dist_opt_format)
+                model_checkpoint_name = get_checkpoint_name(load_dir, iteration, release)
+                optim_checkpoint_name = get_distributed_optimizer_checkpoint_name(
+                    model_checkpoint_name
+                )
+                optimizer.load_parameter_state(
+                    optim_checkpoint_name,
+                    update_legacy_format=args.ckpt_convert_update_legacy_dist_opt_format,
+                )
 
             # Load scheduler.
             if opt_param_scheduler is not None:
-                if 'lr_scheduler' in state_dict: # backward compatbility
+                if 'lr_scheduler' in state_dict:  # backward compatbility
                     opt_param_scheduler.load_state_dict(state_dict['lr_scheduler'])
                 else:
                     opt_param_scheduler.load_state_dict(state_dict['opt_param_scheduler'])
         except KeyError as e:
-            print_rank_0('Unable to load optimizer from checkpoint {}. '
-                         'Specify --no-load-optim or --finetune to prevent '
-                         'attempting to load the optimizer state, '
-                         'exiting ...'.format(checkpoint_name))
+            print_rank_0(
+                'Unable to load optimizer from checkpoint {}. '
+                'Specify --no-load-optim or --finetune to prevent '
+                'attempting to load the optimizer state, '
+                'exiting ...'.format(checkpoint_name)
+            )
             raise e
     else:
         if (args.fp16 or args.bf16) and optimizer is not None:
@@ -2023,34 +2290,37 @@ def load_model_state_dict(module, state_dict, strict: bool):
                 }
             cuda_rng_tracker.set_states(rng_tracker_states)
         except KeyError:
-            print_rank_0('Unable to load rng state from checkpoint {}. '
-                         'Specify --no-load-rng or --finetune to prevent '
-                         'attempting to load the rng state, '
-                         'exiting ...'.format(checkpoint_name))
+            print_rank_0(
+                'Unable to load rng state from checkpoint {}. '
+                'Specify --no-load-rng or --finetune to prevent '
+                'attempting to load the rng state, '
+                'exiting ...'.format(checkpoint_name)
+            )
             sys.exit()
 
     # Some utilities want to load a checkpoint without distributed being initialized
     if torch.distributed.is_initialized():
         torch.distributed.barrier()
 
-    print_rank_0(f'  successfully loaded checkpoint from {load_dir} '
-                 f'[ t {mpu.get_tensor_model_parallel_rank() + 1}/{mpu.get_tensor_model_parallel_world_size()}, '
-                 f'p {mpu.get_pipeline_model_parallel_rank() + 1}/{mpu.get_pipeline_model_parallel_world_size()} ] '
-                 f'at iteration {iteration}')
-                 
+    print_rank_0(
+        f'  successfully loaded checkpoint from {load_dir} '
+        f'[ t {mpu.get_tensor_model_parallel_rank() + 1}/{mpu.get_tensor_model_parallel_world_size()}, '
+        f'p {mpu.get_pipeline_model_parallel_rank() + 1}/{mpu.get_pipeline_model_parallel_world_size()} ] '
+        f'at iteration {iteration}'
+    )
+
     if has_nvidia_modelopt:
         print_distributed_quant_summary(model, msg="After loading checkpoint")
-        
+
     # Additional callback for wandb (last rank)
-    if not torch.distributed.is_initialized() \
-       or is_last_rank():
+    if not torch.distributed.is_initialized() or is_last_rank():
         wandb_utils.on_load_checkpoint_success(checkpoint_name, load_dir)
 
     torch.cuda.empty_cache()
 
     if iteration > 0:
         # Notify FT that a checkpoint was loaded.
-        is_local_chkpt = (ckpt_type == CheckpointType.LOCAL)
+        is_local_chkpt = ckpt_type == CheckpointType.LOCAL
         ft_integration.on_checkpoint_loaded(is_local_chkpt=is_local_chkpt)
 
     # Patch checkpoint as needed if required field is not found.
@@ -2060,7 +2330,9 @@ def load_model_state_dict(module, state_dict, strict: bool):
             if 'default_config' not in param_group:
                 param_group['default_config'] = True
                 if not log_printed:
-                    print_rank_0(">>> Inserting 'default_config' field into optimizer.param_groups...")
+                    print_rank_0(
+                        ">>> Inserting 'default_config' field into optimizer.param_groups..."
+                    )
                 log_printed = True
 
     return iteration, num_floating_point_operations_so_far
@@ -2080,8 +2352,9 @@ def _to_dtensor(wrapped_model, model_state_dict):
     return new_model_sd
 
 
-def load_biencoder_checkpoint(model, only_query_model=False,
-                              only_context_model=False, custom_load_path=None):
+def load_biencoder_checkpoint(
+    model, only_query_model=False, only_context_model=False, custom_load_path=None
+):
     """
     selectively load retrieval models for indexing/retrieving
     from saved checkpoints
@@ -2098,13 +2371,16 @@ def load_biencoder_checkpoint(model, only_query_model=False,
     with open_file(tracker_filename, 'r') as f:
         iteration = int(f.read().strip())
 
-    checkpoint_name = get_checkpoint_name(load_path, iteration,
-                                          args.use_distributed_optimizer,
-                                          release=False)
+    checkpoint_name = get_checkpoint_name(
+        load_path, iteration, args.use_distributed_optimizer, release=False
+    )
 
     if mpu.get_data_parallel_rank() == 0:
-        print('global rank {} is loading checkpoint {}'.format(
-            torch.distributed.get_rank(), checkpoint_name))
+        print(
+            'global rank {} is loading checkpoint {}'.format(
+                torch.distributed.get_rank(), checkpoint_name
+            )
+        )
 
     state_dict = torch.load(checkpoint_name, map_location='cpu')
     ret_state_dict = state_dict['model']
diff --git a/megatron/training/config/common_config.py b/megatron/training/config/common_config.py
index cb61f50c13c..2107816bd85 100644
--- a/megatron/training/config/common_config.py
+++ b/megatron/training/config/common_config.py
@@ -1,7 +1,8 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+import os
 from dataclasses import dataclass, field
 from typing import Literal
-import os
+
 
 @dataclass(kw_only=True)
 class RNGConfig:
@@ -25,7 +26,9 @@ class RNGConfig:
 class ProfilingConfig:
     """Configuration settings for profiling the training process."""
 
-    use_nsys_profiler: bool = field(default=False, metadata={"argparse_meta": {"arg_names": ["--profile"], "dest": "profile"}})
+    use_nsys_profiler: bool = field(
+        default=False, metadata={"argparse_meta": {"arg_names": ["--profile"], "dest": "profile"}}
+    )
     """Enable nsys profiling. When using this option, nsys options should be specified in
     commandline. An example nsys commandline is
     `nsys profile -s none -t nvtx,cuda -o <path/to/output_file> --force-overwrite true
@@ -43,10 +46,10 @@ class ProfilingConfig:
 
     pytorch_profiler_collect_shapes: bool = False
     """Collect tensor shape in pytorch profiler."""
-  
+
     pytorch_profiler_collect_callstack: bool = False
     """Collect callstack in pytorch profiler."""
-  
+
     pytorch_profiler_collect_chakra: bool = False
     """Collect chakra trace in pytorch profiler."""
 
@@ -106,7 +109,9 @@ class DistributedInitConfig:
     """If set, distributed ranks initialize order is changed from tp-cp-ep-dp-pp to tp-cp-ep-pp-dp.
     """
 
-    use_gloo_process_groups: bool = field(default=True, metadata={"argparse_meta": {"arg_names": ["--disable-gloo-process-groups"]}})
+    use_gloo_process_groups: bool = field(
+        default=True, metadata={"argparse_meta": {"arg_names": ["--disable-gloo-process-groups"]}}
+    )
     """If enabled, create Gloo process groups for communications."""
 
     use_sharp: bool = False
diff --git a/megatron/training/config/training_config.py b/megatron/training/config/training_config.py
index 27cffb837f4..c2e30d01f6f 100644
--- a/megatron/training/config/training_config.py
+++ b/megatron/training/config/training_config.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-from dataclasses import dataclass, field
 import signal
+from dataclasses import dataclass, field
 from typing import Literal, Optional
 
+
 @dataclass(kw_only=True)
 class TrainingConfig:
     """Configuration settings related to the training loop."""
@@ -16,7 +17,9 @@ class TrainingConfig:
     data-parallel-size. If this value is None, then use micro-batch-size * data-parallel-size
     as the global batch size. This choice will result in 1 for number of micro-batches."""
 
-    rampup_batch_size: list[int] | None = field(default=None, metadata={"argparse_meta": {"nargs": 3}})
+    rampup_batch_size: list[int] | None = field(
+        default=None, metadata={"argparse_meta": {"nargs": 3}}
+    )
     """Batch size ramp up with the following values: <start batch size>, <batch size increment>,
     <ramp-up samples>
     For example:
@@ -37,7 +40,6 @@ class TrainingConfig:
     Cannot be used together with decrease_batch_size_if_needed.
     """
 
-
     decrease_batch_size_if_needed: bool = False
     """If set, decrease batch size if microbatch_size * dp_size does not 
     divide batch_size. Old batch_size will be restored if training is re-started 
@@ -192,13 +194,30 @@ class SchedulerConfig:
     """number of samples to warmup learning rate over. Calculated at runtime from
     lr_warmup_fraction, lr_warmup_iters, or lr_warmup_samples.
     """
-    
-    override_opt_param_scheduler: bool = field(default=False, metadata={"argparse_meta": {"arg_names": ["--override-opt_param-scheduler", "--override-opt-param-scheduler"]}})
+
+    override_opt_param_scheduler: bool = field(
+        default=False,
+        metadata={
+            "argparse_meta": {
+                "arg_names": ["--override-opt_param-scheduler", "--override-opt-param-scheduler"]
+            }
+        },
+    )
     """Reset the values of the scheduler (learning rate, warmup iterations, minimum learning rate,
     maximum number of iterations, and decay style) from input arguments and ignore values from
     checkpoints. Note that all the above values will be reset."""
 
-    use_checkpoint_opt_param_scheduler: bool = field(default=False, metadata={"argparse_meta": {"arg_names": ["--use-checkpoint-opt_param-scheduler", "--use-checkpoint-opt-param-scheduler"]}})
+    use_checkpoint_opt_param_scheduler: bool = field(
+        default=False,
+        metadata={
+            "argparse_meta": {
+                "arg_names": [
+                    "--use-checkpoint-opt_param-scheduler",
+                    "--use-checkpoint-opt-param-scheduler",
+                ]
+            }
+        },
+    )
     """Use checkpoint to set the values of the scheduler (learning rate, warmup iterations,
     minimum learning rate, maximum number of iterations, and decay style) from checkpoint
     and ignore input arguments."""
@@ -314,7 +333,10 @@ class LoggerConfig:
     runtime_time_unit: str = "hours"
     """Time unit to use for time logging. """
 
-    barrier_with_L1_time: bool = field(default=True, metadata={"argparse_meta": {"arg_names": ["--no-barrier-with-level-1-timing"]}})
+    barrier_with_L1_time: bool = field(
+        default=True,
+        metadata={"argparse_meta": {"arg_names": ["--no-barrier-with-level-1-timing"]}},
+    )
     """If not disabled, use barrier with level 1 time measurements. Note that this is up to the user to
     make sure calling barrier with their timers will not result in hangs. This can happen if for
     example the user adds a level 1 timer that is not called by all ranks.
@@ -361,7 +383,12 @@ class CheckpointConfig:
     save: str | None = None
     """Output directory to save checkpoints to."""
 
-    save_interval: int | None = field(default=None, metadata={"argparse_meta": {"arg_names": ["--save-interval", "--persistent-save-interval"]}})
+    save_interval: int | None = field(
+        default=None,
+        metadata={
+            "argparse_meta": {"arg_names": ["--save-interval", "--persistent-save-interval"]}
+        },
+    )
     """Number of iterations between persistent checkpoint saves."""
 
     save_params_interval: int | None = None
@@ -474,7 +501,15 @@ class CheckpointConfig:
     The legacy format was deprecated on Feb 13, 2024.
     """
 
-    fully_parallel_save: bool = field(default=True, metadata={"argparse_meta": {"arg_names": ["--no-ckpt-fully-parallel-save"], "dest": "ckpt_fully_parallel_save"}})
+    fully_parallel_save: bool = field(
+        default=True,
+        metadata={
+            "argparse_meta": {
+                "arg_names": ["--no-ckpt-fully-parallel-save"],
+                "dest": "ckpt_fully_parallel_save",
+            }
+        },
+    )
     """Disable applying full save parallelization across DP for distributed checkpoints.
     Depending on ckpt format might decrease the number of files in the checkpoint.
     Makes DistributedOptimizer checkpoint non-reshardable."""
@@ -504,10 +539,20 @@ class CheckpointConfig:
     subprocess. Useful on MNNVL systems where fabric resources are exhausted.
     Only applies with the nvrx async strategy."""
 
-    fully_parallel_load: bool = field(default=False, metadata={"argparse_meta": {"arg_names": ["--ckpt-fully-parallel-load"], "dest": "ckpt_fully_parallel_load"}})
+    fully_parallel_load: bool = field(
+        default=False,
+        metadata={
+            "argparse_meta": {
+                "arg_names": ["--ckpt-fully-parallel-load"],
+                "dest": "ckpt_fully_parallel_load",
+            }
+        },
+    )
     """Apply full load parallelization across DP for distributed checkpoints."""
 
-    ckpt_fully_parallel_load_exchange_algo: Literal["broadcast", "gather_rounds", "gather_object"] = "broadcast"
+    ckpt_fully_parallel_load_exchange_algo: Literal[
+        "broadcast", "gather_rounds", "gather_object"
+    ] = "broadcast"
     """Algorithm for fully parallel load of distributed checkpoints.
     "broadcast"(default): Broadcast the checkpoint from rank 0 to all other ranks.
     "gather_rounds": Gather the checkpoint from all ranks in rounds.
@@ -577,8 +622,10 @@ class CheckpointConfig:
     def __post_init__(self):
         from megatron.training.utils import has_nvrx_installed
 
-        assert self.async_strategy in ["nvrx", "mcore"], \
-            f"async_strategy {self.async_strategy} is not supported. Available strategies: nvrx, mcore."
+        assert self.async_strategy in [
+            "nvrx",
+            "mcore",
+        ], f"async_strategy {self.async_strategy} is not supported. Available strategies: nvrx, mcore."
 
         if self.async_save and self.ckpt_format in ["torch_dcp", "fsdp_dtensor"]:
             assert has_nvrx_installed(), (
diff --git a/megatron/training/datasets/data_samplers.py b/megatron/training/datasets/data_samplers.py
index 7a31d846a49..efac0250ae8 100644
--- a/megatron/training/datasets/data_samplers.py
+++ b/megatron/training/datasets/data_samplers.py
@@ -11,7 +11,6 @@
 
 from megatron.core import mpu
 from megatron.core.datasets.utils import Split
-
 from megatron.training import get_args
 from megatron.training.dist_signal_handler import DistributedSignalHandler
 
@@ -37,8 +36,16 @@ def build_pretraining_data_loader(dataset, consumed_samples):
 
     # Use eval-specific batch sizes for validation/test splits
     is_eval = split in (Split.valid, Split.test)
-    micro_batch_size = getattr(args, 'eval_micro_batch_size', args.micro_batch_size) if is_eval else args.micro_batch_size
-    global_batch_size = getattr(args, 'eval_global_batch_size', args.global_batch_size) if is_eval else args.global_batch_size
+    micro_batch_size = (
+        getattr(args, 'eval_micro_batch_size', args.micro_batch_size)
+        if is_eval
+        else args.micro_batch_size
+    )
+    global_batch_size = (
+        getattr(args, 'eval_global_batch_size', args.global_batch_size)
+        if is_eval
+        else args.global_batch_size
+    )
 
     if split == Split.valid and args.full_validation:
         batch_sampler = MegatronPretrainingSampler(
@@ -56,7 +63,8 @@ def build_pretraining_data_loader(dataset, consumed_samples):
                 micro_batch_size=micro_batch_size,
                 global_batch_size=global_batch_size,
                 data_parallel_rank=mpu.get_data_parallel_rank(),
-                data_parallel_size=mpu.get_data_parallel_world_size())
+                data_parallel_size=mpu.get_data_parallel_world_size(),
+            )
         else:
             # Megatron sampler
             batch_sampler = MegatronPretrainingSampler(
@@ -64,7 +72,8 @@ def build_pretraining_data_loader(dataset, consumed_samples):
                 consumed_samples=consumed_samples,
                 micro_batch_size=micro_batch_size,
                 data_parallel_rank=mpu.get_data_parallel_rank(),
-                data_parallel_size=mpu.get_data_parallel_world_size())
+                data_parallel_size=mpu.get_data_parallel_world_size(),
+            )
     elif args.dataloader_type == 'cyclic':
         batch_sampler = MegatronPretrainingRandomSampler(
             dataset,
@@ -97,12 +106,10 @@ def close_nvidia_fds():
         if args.exit_signal_handler:
             DistributedSignalHandler(args.exit_signal).__enter__()
 
-    maybe_worker_init_fn = (
-        worker_init_fn if args.num_workers > 0 else None
-    )
+    maybe_worker_init_fn = worker_init_fn if args.num_workers > 0 else None
     # Torch dataloader.
-    if args.hybrid_context_parallel:
-        extra_kwargs = {"collate_fn": lambda x: x,}
+    if args.dynamic_context_parallel:
+        extra_kwargs = {"collate_fn": lambda x: x}
     else:
         extra_kwargs = {}
     return torch.utils.data.DataLoader(
@@ -115,6 +122,7 @@ def close_nvidia_fds():
         **extra_kwargs,
     )
 
+
 class MegatronPretrainingSampler:
     """
     Sampler for Megatron pretraining dataloaders that divides data samples across
@@ -184,49 +192,6 @@ def __iter__(self):
             start_idx, end_idx = self.get_start_end_idx()
             yield batch[start_idx:end_idx]
 
-class HybridCPMegatronPretrainingSampler(MegatronPretrainingSampler):
-    """
-    Data sampler for hybrid context parallel (Hybrid CP) format.
-    This data sampler pulls in the entire global batch at once across all data parallel ranks.
-    This helps provide the Hybrid CP Dataloader Wrapper to schedule and load balance sub-samples
-    of the entire global batch.
-    """
-
-    def __init__(self, total_samples, consumed_samples, micro_batch_size, global_batch_size,
-                 data_parallel_rank, data_parallel_size, drop_last=True):
-        super().__init__(total_samples, consumed_samples, micro_batch_size, data_parallel_rank, data_parallel_size, drop_last)
-        self.global_batch_size = global_batch_size
-        self.data_parallel_size = data_parallel_size
-        self.num_micro_batches = self.global_batch_size // self.micro_batch_times_data_parallel_size
-
-    def __len__(self):
-        return self.total_samples
-
-    def get_start_end_idx_global_batch(self):
-        start_idx = [self.data_parallel_rank * self.micro_batch_size + i * self.micro_batch_size * self.data_parallel_size for i in range(self.num_micro_batches)]
-        end_idx = [start_idx[i] + self.micro_batch_size for i in range(self.num_micro_batches)]
-        return start_idx, end_idx
-
-    def __iter__(self):
-        batch = []
-        # Last batch will be dropped if drop_last is not set False
-        for idx in range(self.consumed_samples, self.total_samples):
-            batch.append(idx)
-            if len(batch) == self.micro_batch_times_data_parallel_size * self.num_micro_batches:
-                start_idx, end_idx = self.get_start_end_idx_global_batch()
-                global_batch_idx = []
-                for i in range(self.num_micro_batches):
-                    global_batch_idx.extend(batch[start_idx[i]:end_idx[i]])
-                yield global_batch_idx
-                batch = []
-
-        # Check the last partial batch and see drop_last is set
-        if len(batch) > 0 and not self.drop_last:
-            start_idx, end_idx = self.get_start_end_idx_global_batch()
-            global_batch_idx = []
-            for i in range(self.num_micro_batches):
-                global_batch_idx.extend(batch[start_idx[i]:end_idx[i]])
-            yield global_batch_idx
 
 class RandomSeedDataset(Dataset):
     """
diff --git a/megatron/training/datasets/sft_dataset.py b/megatron/training/datasets/sft_dataset.py
index 9de5d2a52fe..250a0137568 100644
--- a/megatron/training/datasets/sft_dataset.py
+++ b/megatron/training/datasets/sft_dataset.py
@@ -2,12 +2,16 @@
 
 import atexit, json
 from collections import Counter
-from typing import Any, Dict, Optional
+import json
+import math
+from typing import Any, Dict, Optional, List, Union
 
 import numpy as np
+import pandas as pd
 import torch
 
 from megatron.core.datasets.gpt_dataset import GPTDatasetConfig
+from megatron.core.datasets.indexed_dataset import IndexedDataset
 from megatron.core.datasets.megatron_dataset import LowLevelDataset, MegatronDataset
 from megatron.core.datasets.utils import Split
 
@@ -88,6 +92,26 @@ def _split_conversations(self, merged_conversations):
             split_conversations.append(current)
         return split_conversations
 
+    def _calculate_padding_divisor(self) -> int:
+        """
+            Calculate the divisor used for sequence padding.
+            tp_pad = tp_size * 2 if tp_size > 1 else 1
+            cp_pad = cp_size * 2 if cp_size > 1 else 1
+            cp_pad = cp_pad * dp_size if dynamic_cp else cp_pad
+            divisor = cp_pad * tp_pad
+        """
+        if self.config.dynamic_context_parallel:
+            # Dynamic CP: consider both CP and DP
+            cp_pad = self.config.data_parallel_size * self.config.context_parallel_size * 2
+        else:
+            # Standard CP: only consider CP
+            cp_pad = self.config.context_parallel_size * 2 if self.config.context_parallel_size > 1 else 1
+        tp_pad = self.config.sequence_parallel_size if self.config.sequence_parallel_size > 0 else 1
+        divisor = cp_pad * tp_pad
+        # TODO(tailaim): do we need to pad for FP8 execution?
+        # divisor = ((divisor + 15) // 16) * 16
+        return divisor
+
     def __getitem__(self, idx: int) -> Dict[str, Any]:
 
         tokenizer = self.config.tokenizer
@@ -124,12 +148,11 @@ def extend_with_padding(tokens, targets, positions, pad_len):
             assert not self.config.reset_position_ids
             pack_positions.extend(range(len(tokens_list)))
 
-            if self.config.context_parallel_size > 1:
-                pad_granularity = self.config.context_parallel_size * 2
-                mod_token_count = len(pack_tokens) % pad_granularity
-                if mod_token_count != 0:
-                    pad_len = pad_granularity - mod_token_count
-                    extend_with_padding(pack_tokens, pack_targets, pack_positions, pad_len)
+            pad_granularity = self._calculate_padding_divisor()
+            mod_token_count = len(pack_tokens) % pad_granularity
+            if mod_token_count != 0:
+                pad_len = pad_granularity - mod_token_count
+                extend_with_padding(pack_tokens, pack_targets, pack_positions, pad_len)
 
             # TODO(duncan): Consider also padding to multiple of number of tokens here. This might
             # be needed for efficiency (and potentially set via command-line argument).
@@ -190,3 +213,214 @@ def extend_with_padding(tokens, targets, positions, pad_len):
             'cu_seqlens': cu_seqlens,
             'max_seqlen': max_seqlen,
         }
+
+
+class MockSFTLowLevelDataset:
+    """The low-level mock dataset for SFT
+
+    Args:
+        mode (str): One of 'file', 'distribution', or 'verification'.
+        **kwargs: Additional arguments depending on mode.
+            For mode='file': path (str) - path to a CSV file with sequence lengths.
+            For mode='distribution': type (str), min_seq_len (int), max_seq_len (int),
+                mean_seq_len (int), and distribution-specific params (e.g. lognormal_sigma).
+            For mode='verification': data_path (str) - prefix path to an IndexedDataset
+                (.bin/.idx files). Optional lognormal distribution params same as
+                'distribution' mode (defaults: min_seq_len=100, max_seq_len=4096,
+                mean_seq_len=2048, lognormal_sigma=1.1).
+        format (str): Output format for MockSFTDataset. Either 'thd' (default, sequence
+            packing with cu_seqlens) or 'sbhd' (padded to seq_length, no cu_seqlens).
+    """
+
+    seed: int = 0
+    """The hard-coded random seed to use to set the NumPy RNG"""
+
+    size: int = 1000000
+    """The hard-coded number of sequence to generate"""
+
+    def __init__(self, mode: str, **kwargs) -> None:
+        np.random.seed(self.seed)
+        self.format = kwargs.get("format", "thd")
+
+        if mode == "file":
+            self.sequence_lengths = np.array(pd.read_csv(kwargs["path"])).flatten()
+            self.size = len(self.sequence_lengths)
+        elif mode == "distribution":
+            min_seq_len = kwargs["min_seq_len"]
+            max_seq_len = kwargs["max_seq_len"]
+            mean_seq_len = kwargs["mean_seq_len"]
+            if kwargs["type"] == "lognormal":
+                lognormal_sigma = kwargs["lognormal_sigma"]
+                self.sequence_lengths = self.generate_lognormal_samples(
+                    self.size, mean_seq_len, lognormal_sigma, min_seq_len, max_seq_len
+                )
+            else:
+                raise ValueError(f"Unsupported distribution type {kwargs['type']}")
+        elif mode == "verification":
+            # Load real tokens from an IndexedDataset for realistic loss curves.
+            # Sequence lengths are drawn from a lognormal distribution (same as
+            # "distribution" mode) to allow controlled comparison of THD vs SBHD.
+            self.indexed_dataset = IndexedDataset(kwargs["data_path"])
+            min_seq_len = kwargs.get("min_seq_len", 100)
+            max_seq_len = kwargs.get("max_seq_len", 4096)
+            mean_seq_len = kwargs.get("mean_seq_len", 2048)
+            lognormal_sigma = kwargs.get("lognormal_sigma", 1.1)
+            self.sequence_lengths = self.generate_lognormal_samples(
+                self.size, mean_seq_len, lognormal_sigma, min_seq_len, max_seq_len
+            )
+        else:
+            raise ValueError(f"Unsupported mode '{mode}', must be 'file', 'distribution', or 'verification'")
+        
+    def generate_lognormal_samples(self, size, mean, sigma, min_seq_len, max_seq_len):   
+        mu = np.log(mean) - sigma**2 / 2
+        samples = np.random.lognormal(mu, sigma, size)
+        samples = np.clip(samples, min_seq_len, max_seq_len)
+        return samples.astype(int)   
+
+    def __len__(self) -> int:
+        return self.size
+
+    def __getitem__(self, idx: int) -> np.ndarray:
+        # The returned sample has 'length-1' tokens; an EOD token is appended
+        # later in MockSFTDataset.__getitem__, making the total 'length' tokens.
+        length = int(self.sequence_lengths[idx % self.size])
+        if hasattr(self, 'indexed_dataset'):
+            target = length - 1
+            num_docs = len(self.indexed_dataset)
+            doc_idx = idx % num_docs
+            raw = self.indexed_dataset[doc_idx]
+            if len(raw) >= target:
+                sample = raw[:target]
+            else:
+                # Concatenate documents until we reach the target length.
+                chunks = [raw]
+                total = len(raw)
+                next_doc = doc_idx + 1
+                while total < target:
+                    raw_next = self.indexed_dataset[next_doc % num_docs]
+                    need = target - total
+                    chunks.append(raw_next[:need])
+                    total += min(len(raw_next), need)
+                    next_doc += 1
+                sample = np.concatenate(chunks)[:target]
+            assert len(sample) == target
+            return sample.astype(np.int64)
+        else:
+            return np.arange(1, length, dtype=np.int64)
+
+
+class MockSFTDataset(SFTDataset):
+    """The mock dataset used during SFT"""
+
+    def __init__(
+        self,
+        dataset: LowLevelDataset,
+        dataset_path: Optional[str],
+        indices: np.ndarray,
+        num_samples: Optional[int],
+        index_split: Split,
+        config: GPTDatasetConfig,
+    ) -> None:
+        super().__init__(dataset, dataset_path, indices, num_samples, index_split, config)
+
+    @staticmethod
+    def build_low_level_dataset(dataset_path: str, config: GPTDatasetConfig) -> LowLevelDataset:
+        if config.sft_mock_dataset_config_json is None:
+            mock_config = {
+                    "mode": "distribution",
+                    "type": "lognormal",
+                    "min_seq_len": config.sequence_length // 2,
+                    "max_seq_len": config.sequence_length,
+                    "mean_seq_len": config.sequence_length // 4 * 3,
+                    "lognormal_sigma": 1.1,
+                }
+        else:
+            mock_config = json.loads(config.sft_mock_dataset_config_json)
+        return MockSFTLowLevelDataset(**mock_config)
+
+    def __len__(self) -> int:
+        return self.num_samples
+
+    def __getitem__(self, idx: int) -> Dict[str, Any]:
+
+        tokenizer = self.config.tokenizer
+        pack_length = self.config.sequence_length
+        eod = tokenizer.eod
+        pad = tokenizer.pad
+
+        tokens = self.dataset[int(self.indices[idx % len(self.indices)])]
+
+        # Convert tokens to list and always append EOD to ensure length consistency.
+        # The low-level dataset returns length-1 tokens, and we add EOD to make it length tokens.
+        tokens_list = tokens.tolist()
+        tokens_list.append(eod)
+
+        if self.dataset.format == "sbhd":
+            # SBHD format: single padded sequence without cu_seqlens.
+            # Long sequences are truncated to pack_length tokens (including EOD).
+            if len(tokens_list) >= pack_length + 1:
+                tokens_list = tokens_list[:pack_length - 1] + [eod]
+            # Pad to pack_length + 1 (offset by 1 for input/label split).
+            pad_len = pack_length + 1 - len(tokens_list)
+            if pad_len > 0:
+                tokens_list = tokens_list + [pad] * pad_len
+            assert len(tokens_list) == pack_length + 1
+            input_ids    = torch.tensor(tokens_list[:-1], dtype=torch.int64)
+            labels       = torch.tensor(tokens_list[1:],  dtype=torch.int64)
+            # Position IDs are sequential across the entire sequence including padding,
+            # matching GPTDataset behavior for standard (non-packed) training.
+            position_ids = torch.arange(pack_length, dtype=torch.int64)
+            loss_mask = torch.ones(pack_length, dtype=torch.float32)
+            loss_mask[labels == pad] = 0.0
+            return {
+                'tokens':       input_ids,
+                'labels':       labels,
+                'loss_mask':    loss_mask,
+                'position_ids': position_ids,
+            }
+
+        # THD format (sequence packing) below.
+        def extend_with_padding(tokens, positions, pad_len):
+            tokens.extend([pad] * pad_len)
+            positions.extend(range(positions[-1] + 1, positions[-1] + 1 + pad_len))
+
+        pack_tokens = list(tokens_list) + [pad]
+        pack_positions = list(range(len(pack_tokens)))
+
+        # Truncate if sequence exceeds pack_length + 1 (need +1 for shift).
+        if len(pack_tokens) > pack_length + 1:
+            pack_tokens = pack_tokens[:pack_length - 1] + [eod, pad]
+            pack_positions = pack_positions[:pack_length + 1]
+
+        # Pad to pad_granularity alignment (tp * cp * 2).
+        # We need final length (after shift) to be divisible by pad_granularity.
+        pad_granularity = self._calculate_padding_divisor()
+        final_len = len(pack_tokens) - 1
+        mod_token_count = final_len % pad_granularity
+        if mod_token_count != 0:
+            pad_len = pad_granularity - mod_token_count
+            extend_with_padding(pack_tokens, pack_positions, pad_len)
+
+        # Apply shift for next-token prediction.
+        input_ids = torch.tensor(pack_tokens[:-1], dtype=torch.int64)
+        labels = torch.tensor(pack_tokens[1:], dtype=torch.int64)
+        position_ids = torch.tensor(pack_positions[:-1], dtype=torch.int64)
+
+        seq_len = len(input_ids)
+        cu_seqlens = [0, seq_len]
+
+        # Loss mask: mask padding tokens
+        loss_mask = torch.ones(seq_len, dtype=torch.float32)
+        loss_mask[labels == pad] = 0.0
+
+        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32)
+        max_seqlen = torch.tensor(seq_len, dtype=torch.int32)
+
+        return {
+            'tokens': input_ids,
+            'labels': labels,
+            'loss_mask': loss_mask,
+            'position_ids': position_ids,
+            'cu_seqlens': cu_seqlens,
+            'max_seqlen': max_seqlen,
+        }
diff --git a/megatron/training/dgrad_logging.py b/megatron/training/dgrad_logging.py
index 4fcfead6d22..a2d1918951a 100644
--- a/megatron/training/dgrad_logging.py
+++ b/megatron/training/dgrad_logging.py
@@ -3,6 +3,7 @@
 """dgrad logging using backward hooks."""
 
 from collections import defaultdict
+
 import torch
 import torch.nn as nn
 
@@ -20,26 +21,36 @@ def _get_linear_types():
     # Add Transformer Engine layers if available.
     try:
         from megatron.core.extensions.transformer_engine import (
+            TEColumnParallelLinear,
+            TELayerNormColumnParallelLinear,
             TELinear,
             TENorm,
-            TEColumnParallelLinear,
             TERowParallelLinear,
-            TELayerNormColumnParallelLinear,
         )
-        types.extend([TELinear, TENorm, TEColumnParallelLinear, TERowParallelLinear,
-                      TELayerNormColumnParallelLinear])
+
+        types.extend(
+            [
+                TELinear,
+                TENorm,
+                TEColumnParallelLinear,
+                TERowParallelLinear,
+                TELayerNormColumnParallelLinear,
+            ]
+        )
     except ImportError:
         pass
 
     try:
         from megatron.core.extensions.transformer_engine import (
-            TEGroupedLinear,
             TEColumnParallelGroupedLinear,
+            TEGroupedLinear,
             TERowParallelGroupedLinear,
         )
+
         if TEGroupedLinear is not None:
-            types.extend([TEGroupedLinear, TEColumnParallelGroupedLinear,
-                          TERowParallelGroupedLinear])
+            types.extend(
+                [TEGroupedLinear, TEColumnParallelGroupedLinear, TERowParallelGroupedLinear]
+            )
     except ImportError:
         pass
 
@@ -51,7 +62,7 @@ def _get_linear_types():
 
 class DataGradLogger:
     """Captures and saves gradients from all linear layers using backward hooks.
-    
+
     NOTE: Right now, we only save the dgrads for the last microbatch in a batch on DP replica 0.
     The code below would need to be extended to save dgrads for all microbatches in a batch."""
 
@@ -62,6 +73,7 @@ def __init__(self, save_dir: str):
 
     def _make_hook(self, model_chunk_name: str, module_name: str):
         """Create a backward hook for a named module."""
+
         def hook(_, grad_input, grad_output):
             for idx, grad in enumerate(grad_output):
                 if grad is not None:
@@ -71,6 +83,7 @@ def hook(_, grad_input, grad_output):
                 if grad is not None:
                     key = f"{module_name}/input{idx}"
                     self._dgrads_state_dict[model_chunk_name][key] = grad.detach().cpu()
+
         return hook
 
     def save(self, iteration: int):
diff --git a/megatron/training/global_vars.py b/megatron/training/global_vars.py
index 6db19d428cc..ac16e1db01b 100644
--- a/megatron/training/global_vars.py
+++ b/megatron/training/global_vars.py
@@ -5,15 +5,18 @@
 import os
 import signal
 import sys
-import torch
-
 from datetime import timedelta
 
+import torch
+
 from megatron.core import Timers
 from megatron.core.config import set_experimental_flag
 from megatron.core.energy_monitor import EnergyMonitor
 from megatron.core.jit import disable_jit_fuser
-from megatron.core.num_microbatches_calculator import init_num_microbatches_calculator, unset_num_microbatches_calculator
+from megatron.core.num_microbatches_calculator import (
+    init_num_microbatches_calculator,
+    unset_num_microbatches_calculator,
+)
 from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
 from megatron.training.dist_signal_handler import DistributedSignalHandler
 
@@ -27,6 +30,7 @@
 _GLOBAL_ENERGY_MONITOR = None
 _GLOBAL_SIGNAL_HANDLER = None
 
+
 def get_args():
     """Return arguments."""
     _ensure_var_is_initialized(_GLOBAL_ARGS, 'args')
@@ -56,6 +60,7 @@ def get_one_logger():
     to check if it is initialized."""
     return _GLOBAL_ONE_LOGGER
 
+
 def get_adlr_autoresume():
     """ADLR autoresume object. It can be None so no need
     to check if it is initialized."""
@@ -67,11 +72,13 @@ def get_timers():
     _ensure_var_is_initialized(_GLOBAL_TIMERS, 'timers')
     return _GLOBAL_TIMERS
 
+
 def get_energy_monitor():
     """Return energy monitor."""
     _ensure_var_is_initialized(_GLOBAL_ENERGY_MONITOR, 'energy monitor')
     return _GLOBAL_ENERGY_MONITOR
 
+
 def get_signal_handler():
     _ensure_var_is_initialized(_GLOBAL_SIGNAL_HANDLER, 'signal handler')
     return _GLOBAL_SIGNAL_HANDLER
@@ -95,6 +102,7 @@ def _graceful_shutdown(signum, frame):
       - Exits the process cleanly
     """
     from megatron.training.utils import print_rank_0
+
     print_rank_0("\nTermination requested. Performing orderly shutdown.")
 
     try:
@@ -209,39 +217,48 @@ def rebuild_tokenizer(args):
 def _set_tensorboard_writer(args):
     """Set tensorboard writer."""
     global _GLOBAL_TENSORBOARD_WRITER
-    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER,
-                                   'tensorboard writer')
+    _ensure_var_is_not_initialized(_GLOBAL_TENSORBOARD_WRITER, 'tensorboard writer')
 
-    if hasattr(args, 'tensorboard_dir') and \
-       args.tensorboard_dir and args.rank == (args.world_size - 1):
+    if (
+        hasattr(args, 'tensorboard_dir')
+        and args.tensorboard_dir
+        and args.rank == (args.world_size - 1)
+    ):
         try:
             from torch.utils.tensorboard import SummaryWriter
+
             print('> setting tensorboard ...')
             _GLOBAL_TENSORBOARD_WRITER = SummaryWriter(
-                log_dir=args.tensorboard_dir,
-                max_queue=args.tensorboard_queue_size)
+                log_dir=args.tensorboard_dir, max_queue=args.tensorboard_queue_size
+            )
         except ModuleNotFoundError:
-            print('WARNING: TensorBoard writing requested but is not '
-                  'available (are you using PyTorch 1.1.0 or later?), '
-                  'no TensorBoard logs will be written.', flush=True)
+            print(
+                'WARNING: TensorBoard writing requested but is not '
+                'available (are you using PyTorch 1.1.0 or later?), '
+                'no TensorBoard logs will be written.',
+                flush=True,
+            )
 
 
 def _set_wandb_writer(args):
     global _GLOBAL_WANDB_WRITER
-    _ensure_var_is_not_initialized(_GLOBAL_WANDB_WRITER,
-                                   'wandb writer')
+    _ensure_var_is_not_initialized(_GLOBAL_WANDB_WRITER, 'wandb writer')
     if getattr(args, 'wandb_project', '') and args.rank == (args.world_size - 1):
         if args.wandb_exp_name == '':
             raise ValueError("Please specify the wandb experiment name!")
 
         import wandb
+
         if args.wandb_save_dir:
             save_dir = args.wandb_save_dir
         else:
             # Defaults to the save dir.
             save_dir = os.path.join(args.save, 'wandb')
         wandb_config = vars(args)
-        if 'kitchen_config_file' in wandb_config and wandb_config['kitchen_config_file'] is not None:
+        if (
+            'kitchen_config_file' in wandb_config
+            and wandb_config['kitchen_config_file'] is not None
+        ):
             # Log the contents of the config for discovery of what the quantization
             # settings were.
             with open(wandb_config['kitchen_config_file'], "r") as f:
@@ -250,7 +267,8 @@ def _set_wandb_writer(args):
             'dir': save_dir,
             'name': args.wandb_exp_name,
             'project': args.wandb_project,
-            'config': wandb_config}
+            'config': wandb_config,
+        }
         if args.wandb_entity:
             wandb_kwargs['entity'] = args.wandb_entity
         os.makedirs(wandb_kwargs['dir'], exist_ok=True)
@@ -269,18 +287,22 @@ def _set_one_logger(args):
             one_logger_async = False
         try:
             from one_logger import OneLogger
+
             config = {
-               'project': args.one_logger_project,
-               'name': args.one_logger_run_name,
-               'async': one_logger_async,
+                'project': args.one_logger_project,
+                'name': args.one_logger_run_name,
+                'async': one_logger_async,
             }
             one_logger = OneLogger(config=config)
             _GLOBAL_ONE_LOGGER = one_logger
         except Exception:
-            print('WARNING: one_logger package is required to enable e2e metrics '
-                  'tracking. please go to '
-                  'https://confluence.nvidia.com/display/MLWFO/Package+Repositories'
-                  ' for details to install it')
+            print(
+                'WARNING: one_logger package is required to enable e2e metrics '
+                'tracking. please go to '
+                'https://confluence.nvidia.com/display/MLWFO/Package+Repositories'
+                ' for details to install it'
+            )
+
 
 def _set_adlr_autoresume(args):
     """Initialize ADLR autoresume."""
@@ -289,6 +311,7 @@ def _set_adlr_autoresume(args):
 
     if args.adlr_autoresume:
         from megatron.training.utils import print_rank_0
+
         print_rank_0('enabling autoresume ...')
         sys.path.append(os.environ.get('SUBMIT_SCRIPTS', '.'))
         try:
@@ -306,6 +329,7 @@ def _set_timers(args):
     _ensure_var_is_not_initialized(_GLOBAL_TIMERS, 'timers')
     _GLOBAL_TIMERS = Timers(args.timing_log_level, args.timing_log_option)
 
+
 def _set_energy_monitor(args):
     """Initialize energy monitor."""
     global _GLOBAL_ENERGY_MONITOR
@@ -322,6 +346,7 @@ def _ensure_var_is_not_initialized(var, name):
     """Make sure the input variable is not None."""
     assert var is None, '{} is already initialized.'.format(name)
 
+
 def destroy_global_vars():
     global _GLOBAL_ARGS
     _GLOBAL_ARGS = None
diff --git a/megatron/training/initialize.py b/megatron/training/initialize.py
index ff655502019..5707b4732ae 100644
--- a/megatron/training/initialize.py
+++ b/megatron/training/initialize.py
@@ -165,6 +165,7 @@ def _compile_dependencies():
 
     torch.distributed.barrier()
 
+
 def _initialize_tp_communicators():
     """initializing the communicators with user buffers for high-performance tensor-model-parallel
     communication overlap"""
@@ -346,7 +347,8 @@ def _initialize_distributed(get_embedding_ranks, get_position_embedding_ranks, s
                 use_sharp=args.use_sharp,
                 context_parallel_size=args.context_parallel_size,
                 hierarchical_context_parallel_sizes=args.hierarchical_context_parallel_sizes,
-                hybrid_context_parallel=args.hybrid_context_parallel,
+                dynamic_context_parallel=args.dynamic_context_parallel,
+                min_dynamic_context_parallel_size=args.min_dynamic_context_parallel_size,
                 expert_model_parallel_size=args.expert_model_parallel_size,
                 num_distributed_optimizer_instances=args.num_distributed_optimizer_instances,
                 expert_tensor_parallel_size=args.expert_tensor_parallel_size,
diff --git a/megatron/training/inprocess_restart.py b/megatron/training/inprocess_restart.py
index ac377de4ac0..fdfc7fd3cbe 100644
--- a/megatron/training/inprocess_restart.py
+++ b/megatron/training/inprocess_restart.py
@@ -15,18 +15,18 @@
 
 from megatron.core import rerun_state_machine
 from megatron.training import get_args
-from megatron.training.async_utils import (
-    reset_persistent_async_worker,
-)
+from megatron.training.async_utils import reset_persistent_async_worker
 
 from . import arguments
 
 
 def destroy_state():
     from . import training
+
     training.destroy_global_state()
     rerun_state_machine.destroy_rerun_state_machine()
 
+
 def inprocess_restart(train, args):
     if inprocess is None:
         warnings.warn('In-process restart is not available')
@@ -82,9 +82,8 @@ def inprocess_restart(train, args):
     class AbortCheckpoint(inprocess.abort.Abort):
         def __init__(self, async_strategy):
             self.async_strategy = async_strategy
-        def __call__(
-            self, state: inprocess.state.FrozenState
-        ) -> inprocess.state.FrozenState:
+
+        def __call__(self, state: inprocess.state.FrozenState) -> inprocess.state.FrozenState:
             reset_persistent_async_worker(self.async_strategy)
             return state
 
@@ -134,7 +133,7 @@ def maybe_wrap_for_inprocess_restart(pretrain):
 
         store = torch.distributed.TCPStore(
             host_name=os.environ['MASTER_ADDR'],
-            port=int(os.environ['MASTER_PORT'])+1,
+            port=int(os.environ['MASTER_PORT']) + 1,
             world_size=int(os.getenv('WORLD_SIZE', '1')),
             is_master=(int(os.getenv('RANK', '0')) == 0),
             timeout=timedelta(seconds=300),
diff --git a/megatron/training/training.py b/megatron/training/training.py
index d8825a7b52f..8dd021db927 100644
--- a/megatron/training/training.py
+++ b/megatron/training/training.py
@@ -2,14 +2,15 @@
 
 """Pretrain utilities."""
 import time
+
 # The earliest we can measure the start time.
 _TRAIN_START_TIME = time.time()
 
 # Startup timestamps for tracking program initialization phases
 _STARTUP_TIMESTAMPS = {
     'program_start': None,  # Set by entry script before imports
-    'main_entry': None,     # Set by entry script at start of __main__
-    'pretrain_entry': None, # Set at top of pretrain()
+    'main_entry': None,  # Set by entry script at start of __main__
+    'pretrain_entry': None,  # Set at top of pretrain()
 }
 
 
@@ -31,10 +32,8 @@ def set_startup_timestamps(program_start=None, main_entry=None):
         _STARTUP_TIMESTAMPS['main_entry'] = main_entry
 
 
-from collections import defaultdict
 import copy
 import dataclasses
-from datetime import datetime, timedelta
 import functools
 import gc
 import inspect
@@ -42,26 +41,30 @@ def set_startup_timestamps(program_start=None, main_entry=None):
 import math
 import os
 import sys
+from collections import defaultdict
 from contextlib import nullcontext
+from datetime import datetime, timedelta
 from pathlib import Path
-from typing import Any, Optional, Dict
+from typing import Any, Dict, Optional
 
 import torch.distributed
 
 from megatron.core.optimizer.distrib_optimizer import DistributedOptimizer
 from megatron.core.optimizer_param_scheduler import get_canonical_lr_for_logging
+
 from .log_handler import CustomHandler
 
 # Make default logging level INFO, but filter out all log messages not from MCore.
 logging.basicConfig(handlers=[CustomHandler()], level=logging.INFO)
 from .theoretical_memory_usage import report_theoretical_memory
 
-_LEGACY_TRAIN_START_TIME = time.time() # NOTE(asolergi-nv): Legacy timestamp
+_LEGACY_TRAIN_START_TIME = time.time()  # NOTE(asolergi-nv): Legacy timestamp
 
 import torch
 
 try:
     from megatron.rl import rl_utils
+
     has_rl_utils = True
 except ImportError:
     has_rl_utils = False
@@ -116,9 +119,7 @@ def set_startup_timestamps(program_start=None, main_entry=None):
 ]
 
 try:
-    from modelopt.torch.distill.plugins.megatron import (
-        get_tensor_shapes_adjust_fn_for_distillation,
-    )
+    from modelopt.torch.distill.plugins.megatron import get_tensor_shapes_adjust_fn_for_distillation
 
     has_nvidia_modelopt = True
 except ImportError:
@@ -131,42 +132,50 @@ def set_startup_timestamps(program_start=None, main_entry=None):
 
 
 from megatron.core import mpu, tensor_parallel
-from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
-    is_linear_attention_variant,
+from megatron.core.distributed import DistributedDataParallel as DDP
+from megatron.core.distributed import (
+    DistributedDataParallelConfig,
+    TorchFullyShardedDataParallelConfig,
 )
-from megatron.core.utils import (
-    check_param_hashes_across_dp_replicas,
-    configure_nvtx_profiling,
-    get_attr_wrapped_model,
-    get_model_config,
-    get_pg_size,
-    get_pg_rank,
-    StragglerDetector,
+from megatron.core.distributed.fsdp.mcore_fsdp_adapter import (
+    FullyShardedDataParallel as megatron_FSDP,
 )
 from megatron.core.fp8_utils import correct_amax_history_if_needed
-from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.full_cuda_graph import FullCudaGraphWrapper
+from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+    is_linear_attention_variant,
+)
+from megatron.core.optimizer import get_mup_config_overrides, get_standard_config_overrides
+from megatron.core.optimizer.optimizer import param_group_identifier_keys
+from megatron.core.optimizer.optimizer_cuda_graph import OptimizerCudaGraphWrapper
+from megatron.core.optimizer.qk_clip import clip_qk
 from megatron.core.pipeline_parallel.utils import (
     is_pp_first_stage,
     is_pp_last_stage,
     is_vp_first_stage,
     is_vp_last_stage,
 )
-from megatron.core.optimizer import get_mup_config_overrides, get_standard_config_overrides
-from megatron.training.checkpointing import load_checkpoint
-from megatron.training.checkpointing import save_checkpoint, save_grads
-from megatron.training.checkpointing import checkpoint_exists
-from megatron.training.checkpointing import get_loaded_iteration
-from megatron.core.full_cuda_graph import FullCudaGraphWrapper
-from megatron.core.optimizer.optimizer_cuda_graph import OptimizerCudaGraphWrapper
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.cuda_graphs import TECudaGraphHelper
 from megatron.core.transformer.enums import CudaGraphScope
 from megatron.core.transformer.module import Float16Module
-from megatron.core.distributed import DistributedDataParallelConfig, TorchFullyShardedDataParallelConfig
-from megatron.core.distributed import DistributedDataParallel as DDP
-from megatron.core.distributed.fsdp.mcore_fsdp_adapter import FullyShardedDataParallel as megatron_FSDP
-from megatron.core.optimizer.optimizer import param_group_identifier_keys
-
-from megatron.core.optimizer.qk_clip import clip_qk
+from megatron.core.transformer.moe.paged_stash import PagedStashRunner
+from megatron.core.utils import (
+    StragglerDetector,
+    check_param_hashes_across_dp_replicas,
+    configure_nvtx_profiling,
+    get_attr_wrapped_model,
+    get_model_config,
+    get_pg_rank,
+    get_pg_size,
+)
+from megatron.training.checkpointing import (
+    checkpoint_exists,
+    get_loaded_iteration,
+    load_checkpoint,
+    save_checkpoint,
+    save_grads,
+)
 
 try:
     from megatron.core.distributed import TorchFullyShardedDataParallel as torch_FSDP
@@ -177,92 +186,96 @@ def set_startup_timestamps(program_start=None, main_entry=None):
 
 from megatron.core.distributed import finalize_model_grads
 from megatron.core.enums import ModelType
-from megatron.core.optimizer import (
-    get_megatron_optimizer,
-    OptimizerConfig,
-    ParamKey,
+from megatron.core.inference.symmetric_memory import SymmetricMemoryManager
+from megatron.core.inference.unified_memory import create_unified_mempool
+from megatron.core.optimizer import OptimizerConfig, ParamKey, get_megatron_optimizer
+from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler
+from megatron.core.parallel_state import (
+    create_all_gather_groups,
+    destroy_global_memory_buffer,
+    destroy_model_parallel,
+    update_pg_timeout,
 )
 from megatron.core.rerun_state_machine import (
-    get_rerun_state_machine,
-    destroy_rerun_state_machine,
     RerunDataIterator,
     RerunMode,
+    destroy_rerun_state_machine,
+    get_rerun_state_machine,
 )
-from megatron.training.initialize import initialize_megatron
-from megatron.training.initialize import write_args_to_tensorboard
-from megatron.training.initialize import set_jit_fusion_options
-from megatron.training.utils import get_batch_on_this_cp_rank, get_batch_on_this_tp_rank, is_hybrid_model
-from megatron.training.datasets.data_samplers import build_pretraining_data_loader
-from megatron.core.datasets.data_schedule import HybridCPDataLoaderWrapper
-from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler
-from megatron.core.transformer.moe import upcycling_utils
-from megatron.core.transformer.moe.moe_utils import track_moe_metrics, clear_aux_losses_tracker
+from megatron.core.resharding.refit import swap_model_weights
 from megatron.core.transformer.experimental_attention_variant.dsa import DSAIndexerLossLoggingHelper
+from megatron.core.transformer.moe import upcycling_utils
+from megatron.core.transformer.moe.moe_logging import (
+    get_moe_metrics_tracker,
+    get_moe_overload_factor_tracker,
+)
 from megatron.core.transformer.multi_token_prediction import MTPLossLoggingHelper
-from megatron.core.parallel_state import (
-    destroy_global_memory_buffer,
-    destroy_model_parallel,
-    update_pg_timeout,
-    create_all_gather_groups,
+from megatron.training.datasets.data_samplers import build_pretraining_data_loader
+from megatron.training.initialize import (
+    initialize_megatron,
+    set_jit_fusion_options,
+    write_args_to_tensorboard,
+)
+from megatron.training.utils import (
+    get_batch_on_this_cp_rank,
+    get_batch_on_this_tp_rank,
+    is_hybrid_model,
 )
-from megatron.core.inference.symmetric_memory import SymmetricMemoryManager
-from megatron.core.inference.unified_memory import create_unified_mempool
-from megatron.core.resharding.refit import swap_model_weights
 
 try:
     from torch_memory_saver import torch_memory_saver
+
     torch_memory_saver.hook_mode = "torch"
     HAVE_TORCH_MEMORY_SAVER = True
 except ImportError:
     HAVE_TORCH_MEMORY_SAVER = False
 
-from megatron.core.pipeline_parallel import get_forward_backward_func
+from megatron.core.datasets.data_schedule import wrap_data_iterator
 from megatron.core.num_microbatches_calculator import (
     destroy_num_microbatches_calculator,
     get_current_global_batch_size,
     get_current_running_global_batch_size,
     get_num_microbatches,
-    update_num_microbatches
+    update_num_microbatches,
 )
+from megatron.core.pipeline_parallel import get_forward_backward_func
 
+from . import ft_integration, one_logger_utils
+from .activation_logging import (
+    disable_activation_logging,
+    disable_tokens_per_expert_logging,
+    enable_activation_logging,
+    enable_tokens_per_expert_logging,
+    save_activations,
+    save_tokens_per_expert,
+)
 from .async_utils import maybe_finalize_async_save
+from .dgrad_logging import disable_dgrad_logging, enable_dgrad_logging, save_dgrads
+from .global_vars import (
+    destroy_global_vars,
+    get_args,
+    get_energy_monitor,
+    get_one_logger,
+    get_signal_handler,
+    get_tensorboard_writer,
+    get_timers,
+    get_tokenizer,
+    get_wandb_writer,
+)
 from .utils import (
     append_to_progress_log,
     calc_params_l2_norm,
     check_adlr_autoresume_termination,
-    logical_and_across_model_parallel_group,
-    reduce_max_stat_across_model_parallel_group,
     is_last_rank,
+    logical_and_across_model_parallel_group,
     print_rank_0,
     print_rank_last,
+    reduce_max_stat_across_model_parallel_group,
     report_memory,
+    to_empty_if_meta_device,
     unwrap_model,
     update_use_dist_ckpt,
-    to_empty_if_meta_device,
 )
-from .global_vars import (
-    destroy_global_vars,
-    get_args,
-    get_signal_handler,
-    get_timers,
-    get_tensorboard_writer,
-    get_wandb_writer,
-    get_one_logger,
-    get_tokenizer,
-    get_energy_monitor,
-)
-from . import one_logger_utils
-from .activation_logging import (
-    enable_activation_logging,
-    disable_activation_logging,
-    save_activations,
-    enable_tokens_per_expert_logging,
-    disable_tokens_per_expert_logging,
-    save_tokens_per_expert,
-)
-from .dgrad_logging import enable_dgrad_logging, disable_dgrad_logging, save_dgrads
-
-from . import ft_integration
 
 stimer = StragglerDetector()
 
@@ -280,7 +293,7 @@ def destroy_global_state():
 
 def print_datetime(string, override_timestamp=None):
     """Note that this call will sync across all ranks. Use override_timestamp if provided;
-       otherwise use current timestamp."""
+    otherwise use current timestamp."""
     torch.distributed.barrier()
     if override_timestamp is None:
         time_str = datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
@@ -288,46 +301,113 @@ def print_datetime(string, override_timestamp=None):
         time_str = datetime.fromtimestamp(override_timestamp).strftime('%Y-%m-%d %H:%M:%S.%f')
     print_rank_0(f'[{string}] datetime: {time_str} ')
 
-def num_floating_point_operations(args, batch_size):
-    def mlp_layer_flops(batch_size, seq_len, hidden_size, expansion=4.0, swiglu=False):
+
+def num_floating_point_operations(
+    args, seqlen_sum_this_global_batch, seqlen_squared_sum_this_global_batch
+):
+    def calculate_layer_counts():
+        """Calculate the number of attention, Mamba, and MLP layers."""
+        if args.hybrid_override_pattern:
+            from megatron.core.ssm.mamba_hybrid_layer_allocation import parse_hybrid_pattern
+
+            # Parse unified pattern to separate main and MTP components
+            parsed = parse_hybrid_pattern(args.hybrid_override_pattern)
+            counts = {'M': 0, '*': 0, '-': 0, 'E': 0}
+            # Count main decoder layers
+            if parsed.main_pattern:
+                for layer_type in parsed.main_pattern:
+                    if layer_type in counts:
+                        counts[layer_type] += 1
+            # Count MTP layers (pattern repeated mtp_num_depths times)
+            if parsed.mtp_pattern and parsed.mtp_num_depths > 0:
+                for layer_type in parsed.mtp_pattern:
+                    if layer_type in counts:
+                        counts[layer_type] += parsed.mtp_num_depths
+            return counts['*'], counts['M'], counts['-'], counts['E']
+        else:
+            num_attn_layers = round(args.num_layers * args.hybrid_attention_ratio)
+            num_mlp_layers = round(args.num_layers * args.hybrid_mlp_ratio)
+            num_mamba_layers = args.num_layers - num_attn_layers - num_mlp_layers
+            num_moe_layers = 0
+            return num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers
+
+    def mlp_layer_flops(seqlen_sum_this_global_batch, hidden_size, expansion=4.0, swiglu=False):
         """Calculate FLOPs for an MLP layer."""
         scale_factor = 3.0 / 2.0 if swiglu else 1.0
-        return 4 * expansion * scale_factor * batch_size * seq_len * hidden_size**2
-
-    def moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size,
-                        shared_expert_ffn_hidden_size, num_experts_routed_to,
-                        moe_latent_size=None, swiglu=False):
+        return 4 * expansion * scale_factor * seqlen_sum_this_global_batch * hidden_size**2
+
+    def moe_layer_flops(
+        seqlen_sum_this_global_batch,
+        hidden_size,
+        moe_ffn_hidden_size,
+        shared_expert_ffn_hidden_size,
+        num_experts_routed_to,
+        moe_latent_size=None,
+        swiglu=False,
+    ):
         """Calculate FLOPs for an MoE layer."""
         scale_factor = 3.0 / 2.0 if swiglu else 1.0
         if moe_latent_size is None:
-            routed_flops = (4 * batch_size * seq_len * hidden_size *
-                            moe_ffn_hidden_size * num_experts_routed_to * scale_factor)
+            routed_flops = (
+                4
+                * seqlen_sum_this_global_batch
+                * hidden_size
+                * moe_ffn_hidden_size
+                * num_experts_routed_to
+                * scale_factor
+            )
         else:
             # Routed experts run on moe_latent_size.
-            routed_flops = (4 * batch_size * seq_len * moe_latent_size *
-                            moe_ffn_hidden_size * num_experts_routed_to * scale_factor)
+            routed_flops = (
+                4
+                * seqlen_sum_this_global_batch
+                * moe_latent_size
+                * moe_ffn_hidden_size
+                * num_experts_routed_to
+                * scale_factor
+            )
             # Up proj and down proj.
-            routed_flops += (4 * batch_size * seq_len * hidden_size * moe_latent_size)
-        shared_flops = 4 * batch_size * seq_len * hidden_size * shared_expert_ffn_hidden_size * scale_factor
+            routed_flops += 4 * seqlen_sum_this_global_batch * hidden_size * moe_latent_size
+        shared_flops = (
+            4
+            * seqlen_sum_this_global_batch
+            * hidden_size
+            * shared_expert_ffn_hidden_size
+            * scale_factor
+        )
         return routed_flops + shared_flops
 
     def attn_layer_flops(
-        batch_size, seq_len, hidden_size, num_heads, gqa=True, gqa_groups=8, kv_channels=None
+        seqlen_sum_this_global_batch,
+        seqlen_squared_sum_this_global_batch,
+        hidden_size,
+        num_heads,
+        gqa=True,
+        gqa_groups=8,
+        kv_channels=None,
     ):
         """Calculate FLOPs for an attention layer."""
         p = (kv_channels * num_heads / hidden_size) if kv_channels else 1
         g = gqa_groups if gqa else num_heads
         return (
             4
-            * batch_size
-            * seq_len
             * hidden_size
             * p
-            * (hidden_size + (hidden_size * (g / num_heads)) + (seq_len / 2))
+            * (
+                hidden_size * seqlen_sum_this_global_batch
+                + (hidden_size * (g / num_heads)) * seqlen_sum_this_global_batch
+                + (seqlen_squared_sum_this_global_batch / 2)
+            )
         )
 
-    def mamba_layer_flops(batch_size, seq_len, hidden_size, state_dim=16,
-                          head_dim=64, num_groups=1, num_heads=128):
+    def mamba_layer_flops(
+        seqlen_sum_this_global_batch,
+        hidden_size,
+        state_dim=16,
+        head_dim=64,
+        num_groups=1,
+        num_heads=128,
+    ):
         """Calculate FLOPs for a Mamba layer."""
         # Note (rwaleffe): flops estimate for scan should be updated based on new SSD kernels,
         # but small percent of overall layer flops
@@ -339,66 +419,118 @@ def mamba_layer_flops(batch_size, seq_len, hidden_size, state_dim=16,
         return (
             (
                 2
-                * batch_size
-                * seq_len
+                * seqlen_sum_this_global_batch
                 * hidden_size
                 * (2 * d_in + 2 * num_groups * state_dim + nheads)
             )  # in_proj
-            + (7 * batch_size * seq_len * d_in * state_dim)  # scan
-            + (2 * batch_size * seq_len * d_in * hidden_size)  # out_proj
+            + (7 * seqlen_sum_this_global_batch * d_in * state_dim)  # scan
+            + (2 * seqlen_sum_this_global_batch * d_in * hidden_size)  # out_proj
         )
 
-    def gdn_layer_flops(batch_size, seq_len, hidden_size,
-                        qk_head_dim=128, v_head_dim=128,
-                        num_qk_heads=16, num_v_heads=32,
-                        conv_kernel_dim=4):
+    def gdn_layer_flops(
+        seqlen_sum_this_global_batch,
+        hidden_size,
+        qk_head_dim=128,
+        v_head_dim=128,
+        num_qk_heads=16,
+        num_v_heads=32,
+        conv_kernel_dim=4,
+    ):
         """Calculate FLOPs for a Gated Delta Net (GDN) layer."""
         qk_dim = qk_head_dim * num_qk_heads
         v_dim = v_head_dim * num_v_heads
         return (
-            2 * batch_size * seq_len * (
+            2
+            * seqlen_sum_this_global_batch
+            * (
                 # in_proj: hidden_size -> (2*qk_dim + 2*v_dim + 2*num_v_heads)
                 hidden_size * (2 * qk_dim + 2 * v_dim + 2 * num_v_heads)
                 # conv1d
                 + conv_kernel_dim * (2 * qk_dim + v_dim)
                 # gated delta rule: KK^T, VK^T, S(a(I-bKK^T)), and SQ
-                + num_v_heads * (v_head_dim ** 2) * 4
+                + num_v_heads * (v_head_dim**2) * 4
                 # out_proj: v_dim -> hidden_size
                 + hidden_size * v_dim
             )
         )
 
-    def hybrid_flops(batch_size, seq_len, hidden_size,
-                     num_attn_layers, num_mamba_layers, num_mlp_layers, num_moe_layers,
-                     num_gdn_layers=0,
-                     mamba_state_dim=128, mamba_head_dim=64,
-                     mamba_num_groups=8, mamba_num_heads=128,
-                     num_attn_heads=32, gqa=True,
-                     gqa_groups=8, kv_channels=None,
-                     mlp_expansion=4.0, swiglu=False,
-                     moe_latent_size=None,
-                     moe_ffn_hidden_size=2048, shared_expert_ffn_hidden_size=2048, num_experts_routed_to=1,
-                     gdn_qk_head_dim=128, gdn_v_head_dim=128,
-                     gdn_num_qk_heads=16, gdn_num_v_heads=32,
-                     gdn_conv_kernel_dim=4,
-                     vocab_size=256000, mtp_num_layers=0):
+    def hybrid_flops(
+        seqlen_sum_this_global_batch,
+        seqlen_squared_sum_this_global_batch,
+        hidden_size,
+        num_attn_layers,
+        num_mamba_layers,
+        num_mlp_layers,
+        num_moe_layers,
+        num_gdn_layers=0,
+        mamba_state_dim=128,
+        mamba_head_dim=64,
+        mamba_num_groups=8,
+        mamba_num_heads=128,
+        num_attn_heads=32,
+        gqa=True,
+        gqa_groups=8,
+        kv_channels=None,
+        mlp_expansion=4.0,
+        swiglu=False,
+        moe_latent_size=None,
+        moe_ffn_hidden_size=2048,
+        shared_expert_ffn_hidden_size=2048,
+        num_experts_routed_to=1,
+        gdn_qk_head_dim=128,
+        gdn_v_head_dim=128,
+        gdn_num_qk_heads=16,
+        gdn_num_v_heads=32,
+        gdn_conv_kernel_dim=4,
+        vocab_size=256000,
+        mtp_num_layers=0,
+    ):
         """Calculate total FLOPs for the hybrid model."""
         flops_fwd = (
-                num_attn_layers * attn_layer_flops(batch_size, seq_len, hidden_size,
-                                                   num_attn_heads, gqa, gqa_groups, kv_channels) +
-                num_mlp_layers * mlp_layer_flops(batch_size, seq_len, hidden_size,
-                                                 mlp_expansion, swiglu) +
-                num_mamba_layers * mamba_layer_flops(batch_size, seq_len, hidden_size,
-                                                     mamba_state_dim, mamba_head_dim,
-                                                     mamba_num_groups, mamba_num_heads) +
-                num_moe_layers * moe_layer_flops(batch_size, seq_len, hidden_size, moe_ffn_hidden_size,
-                                                 shared_expert_ffn_hidden_size, num_experts_routed_to,
-                                                 moe_latent_size, swiglu) +
-                num_gdn_layers * gdn_layer_flops(batch_size, seq_len, hidden_size,
-                                                  gdn_qk_head_dim, gdn_v_head_dim,
-                                                  gdn_num_qk_heads, gdn_num_v_heads,
-                                                  gdn_conv_kernel_dim) +
-                (2 * batch_size * seq_len * hidden_size * vocab_size * (1 + mtp_num_layers))  # logits computation
+            num_attn_layers
+            * attn_layer_flops(
+                seqlen_sum_this_global_batch,
+                seqlen_squared_sum_this_global_batch,
+                hidden_size,
+                num_attn_heads,
+                gqa,
+                gqa_groups,
+                kv_channels,
+            )
+            + num_mlp_layers
+            * mlp_layer_flops(seqlen_sum_this_global_batch, hidden_size, mlp_expansion, swiglu)
+            + num_mamba_layers
+            * mamba_layer_flops(
+                seqlen_sum_this_global_batch,
+                hidden_size,
+                mamba_state_dim,
+                mamba_head_dim,
+                mamba_num_groups,
+                mamba_num_heads,
+            )
+            + num_moe_layers
+            * moe_layer_flops(
+                seqlen_sum_this_global_batch,
+                hidden_size,
+                moe_ffn_hidden_size,
+                shared_expert_ffn_hidden_size,
+                num_experts_routed_to,
+                moe_latent_size,
+                swiglu,
+            )
+            + num_gdn_layers
+            * gdn_layer_flops(
+                seqlen_sum_this_global_batch,
+                hidden_size,
+                gdn_qk_head_dim,
+                gdn_v_head_dim,
+                gdn_num_qk_heads,
+                gdn_num_v_heads,
+                gdn_conv_kernel_dim,
+            )
+            + (
+                2 * seqlen_sum_this_global_batch * hidden_size * vocab_size * (1 + mtp_num_layers)
+            )  # logits computation
         )
         return flops_fwd * 3
 
@@ -469,13 +601,18 @@ def transformer_flops():
             assert not args.group_query_attention
             '''
             Basic arithmetic
-            let B is batch size, s is seq_len, h is embedding dim,
-            for one self_attnetion block (prenorm is not included)
-            qkv projection:  6Bsh^2
-            attn:            2Bs^2h
-            attn over value: 2Bs^2h
-            oproj:           2Bsh^2
-
+            
+            Let h be the embedding dim.
+            We use two statistics to unify BSHD and THD cases:
+                seqlen_sum_this_global_batch: total number of tokens in this global batch
+                seqlen_squared_sum_this_global_batch: sum of squared sequence lengths in this global batch
+
+            For one self-attention block (prenorm not included):
+                qkv projection:      6 * seqlen_sum_this_global_batch * h^2
+                attn:    2 * seqlen_squared_sum_this_global_batch * h
+                attn over value:   2 * seqlen_squared_sum_this_global_batch * h
+                oproj:   2 * seqlen_sum_this_global_batch * h^2
+            
             references
             https://arxiv.org/abs/2305.10403
             https://arxiv.org/abs/2205.05198
@@ -497,23 +634,29 @@ def transformer_flops():
                 forward_backward_expansion_factor
                 * fma_expansion_factor
                 * (
-                    ## q lora + rope + q norm
-                    q_term
-                    ## kv lora + rope + kv norm
-                    + args.kv_lora_rank
+                    seqlen_sum_this_global_batch
                     * (
-                        args.hidden_size
-                        + args.num_attention_heads * (args.qk_head_dim + args.v_head_dim)
-                        + 1
+                        ## q lora + rope + q norm
+                        q_term
+                        ## kv lora + rope + kv norm
+                        + args.kv_lora_rank
+                        * (
+                            args.hidden_size
+                            + args.num_attention_heads * (args.qk_head_dim + args.v_head_dim)
+                            + 1
+                        )
+                        + args.hidden_size * args.qk_pos_emb_head_dim
+                        ## o proj
+                        + (args.num_attention_heads * args.v_head_dim) * args.hidden_size
                     )
-                    + args.hidden_size * args.qk_pos_emb_head_dim
-                    ## o proj
-                    + (args.num_attention_heads * args.v_head_dim) * args.hidden_size
                     ## core attn
-                    + args.seq_length
+                    + seqlen_squared_sum_this_global_batch
                     * (args.num_attention_heads * (args.qk_head_dim + args.qk_pos_emb_head_dim))
                     / 2  # causal mask (only half of the mask is non-zero)
-                    + args.seq_length * args.num_attention_heads * args.v_head_dim / 2
+                    + seqlen_squared_sum_this_global_batch
+                    * args.num_attention_heads
+                    * args.v_head_dim
+                    / 2
                 )
             )
 
@@ -527,22 +670,24 @@ def transformer_flops():
                 forward_backward_expansion_factor
                 * fma_expansion_factor
                 * (
-                    ## qkv proj
-                    args.hidden_size
+                    seqlen_sum_this_global_batch
                     * (
-                        query_projection_size
-                        + key_projection_size
-                        + value_projection_size
-                        + gate_projection_size
+                        ## qkv proj
+                        args.hidden_size
+                        * (
+                            query_projection_size
+                            + key_projection_size
+                            + value_projection_size
+                            + gate_projection_size
+                        )
                     )
                     ## core attention
                     + query_projection_size
-                    * args.seq_length
+                    * seqlen_squared_sum_this_global_batch
                     / 2  # causal mask (only half of the mask is non-zero)
                     * 2  # QK^T and (QK^T)V
                     ## out proj
-                    + query_projection_size
-                    * args.hidden_size
+                    + seqlen_sum_this_global_batch * query_projection_size * args.hidden_size
                 )
             )
 
@@ -551,8 +696,8 @@ def transformer_flops():
             if isinstance(args.linear_attention_freq, int):
                 linear_attention_pattern = [
                     # [1,1,...,1,0,1,1,...,1,0,...]
-                    0 if ((i + 1) % args.linear_attention_freq == 0)
-                    else 1 for i in range(num_layers)
+                    0 if ((i + 1) % args.linear_attention_freq == 0) else 1
+                    for i in range(num_layers)
                 ]
             elif isinstance(args.linear_attention_freq, list):
                 linear_attention_pattern = args.linear_attention_freq
@@ -589,20 +734,15 @@ def transformer_flops():
                     * fma_expansion_factor
                     * (
                         ## in proj
-                        args.hidden_size
-                        * (2 * qk_dim + 2 * v_dim + 2 * num_v_heads)
+                        args.hidden_size * (2 * qk_dim + 2 * v_dim + 2 * num_v_heads)
                         ## conv1d
-                        + args.linear_conv_kernel_dim
-                        * (2 * qk_dim + v_dim)
+                        + args.linear_conv_kernel_dim * (2 * qk_dim + v_dim)
                         ## gated delta rule
-                        + num_v_heads
-                        * (v_head_dim ** 2)
-                        * 4  # KK^T, VK^T, S(a(I-bKK^T)), and SQ
+                        + num_v_heads * (v_head_dim**2) * 4  # KK^T, VK^T, S(a(I-bKK^T)), and SQ
                         ## out proj
-                        + args.hidden_size
-                        * v_dim
+                        + args.hidden_size * v_dim
                     )
-                )
+                ) * seqlen_sum_this_global_batch
             else:
                 raise ValueError(
                     "Invalid experimental_attention_variant: "
@@ -619,8 +759,7 @@ def transformer_flops():
         )
 
         total_floating_point_operations = (
-            batch_size
-            * args.seq_length
+            seqlen_sum_this_global_batch
             * (
                 # MLP
                 forward_backward_expansion_factor
@@ -628,8 +767,7 @@ def transformer_flops():
                 * args.hidden_size
                 * (
                     # dense layer (deepseek v2, v3 style)
-                    (args.ffn_hidden_size * ffn_expansion_factor)
-                    * num_dense_layers
+                    (args.ffn_hidden_size * ffn_expansion_factor) * num_dense_layers
                     # routed experts
                     + (
                         (moe_ffn_hidden_size * num_experts_routed_to * ffn_expansion_factor)
@@ -647,11 +785,8 @@ def transformer_flops():
                     )
                     * num_moe_layers
                     # Shared Experts.
-                    + (shared_expert_ffn_hidden_size * ffn_expansion_factor)
-                    * num_moe_layers
+                    + (shared_expert_ffn_hidden_size * ffn_expansion_factor) * num_moe_layers
                 )
-                # Self Attention
-                + self_attn_term
                 # MTP norms and proj
                 + forward_backward_expansion_factor
                 * fma_expansion_factor
@@ -669,6 +804,9 @@ def transformer_flops():
                 * args.padded_vocab_size
                 * (mtp_num_layers + 1)  # MTP + final logit
             )
+            +
+            # Self Attention
+            self_attn_term
         )
         return total_floating_point_operations
 
@@ -678,6 +816,7 @@ def transformer_flops():
         from operator import itemgetter
 
         from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols, get_hybrid_layer_counts
+
         num_mamba_layers, num_gdn_layers, num_attn_layers, num_mlp_layers, num_moe_layers = (
             itemgetter(Symbols.MAMBA, Symbols.GDN, Symbols.ATTENTION, Symbols.MLP, Symbols.MOE)(
                 get_hybrid_layer_counts(args.hybrid_layer_pattern)
@@ -689,8 +828,8 @@ def transformer_flops():
             mtp_num_layers = 0
         # Compute hybrid model FLOPs.
         return hybrid_flops(
-            batch_size=batch_size,
-            seq_len=args.seq_length,
+            seqlen_sum_this_global_batch=seqlen_sum_this_global_batch,
+            seqlen_squared_sum_this_global_batch=seqlen_squared_sum_this_global_batch,
             hidden_size=args.hidden_size,
             num_attn_layers=num_attn_layers,
             num_mamba_layers=num_mamba_layers,
@@ -708,10 +847,16 @@ def transformer_flops():
             mlp_expansion=args.ffn_hidden_size / args.hidden_size,
             swiglu=args.swiglu,
             moe_latent_size=args.moe_latent_size,
-            moe_ffn_hidden_size=(args.moe_ffn_hidden_size if args.moe_ffn_hidden_size is not None
-                                 else args.ffn_hidden_size),
-            shared_expert_ffn_hidden_size=(0 if args.moe_shared_expert_intermediate_size is None
-                                           else args.moe_shared_expert_intermediate_size),
+            moe_ffn_hidden_size=(
+                args.moe_ffn_hidden_size
+                if args.moe_ffn_hidden_size is not None
+                else args.ffn_hidden_size
+            ),
+            shared_expert_ffn_hidden_size=(
+                0
+                if args.moe_shared_expert_intermediate_size is None
+                else args.moe_shared_expert_intermediate_size
+            ),
             num_experts_routed_to=args.moe_router_topk,
             gdn_qk_head_dim=args.linear_key_head_dim or 128,
             gdn_v_head_dim=args.linear_value_head_dim or 128,
@@ -765,7 +910,9 @@ def _get_field(string, type):
                 # save_checkpoint was called directly (without save_checkpoint_and_time),
                 # which writes "Saved async checkpoint" but not "Saving async checkpoint".
                 if latest_num_floating_point_operations_uncommitted is not None:
-                    latest_num_floating_point_operations = latest_num_floating_point_operations_uncommitted
+                    latest_num_floating_point_operations = (
+                        latest_num_floating_point_operations_uncommitted
+                    )
                     latest_num_floating_point_operations_uncommitted = None
             if world_size_in_line != args.world_size:
                 # Re-start search if we see a different world size.
@@ -779,8 +926,10 @@ def _get_field(string, type):
     assert (
         start_time is not None and start_num_floating_point_operations is not None
     ), "Should have seen at least one 'Starting job' entry with same world_size"
-    print_rank_0(f"megatron.training.get_start_time_from_progress_log: "
-                 f"{start_time=}, {start_num_floating_point_operations=}")
+    print_rank_0(
+        f"megatron.training.get_start_time_from_progress_log: "
+        f"{start_time=}, {start_num_floating_point_operations=}"
+    )
     return datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S'), start_num_floating_point_operations
 
 
@@ -797,6 +946,7 @@ def preprocess_common_state_dict(common_state_dict):
         preprocessed_common_state_dict['args']['use_distributed_optimizer']
         and "optimizer" in preprocessed_common_state_dict
     ):
+
         def reorder_inner_param_groups(optimizer_state_dict):
             # When distributed optimizer loading, source param groups will be reordered,
             # so we reorder the param groups here to prevent warning.
@@ -911,11 +1061,12 @@ def pretrain(
     timers = get_timers()
 
     if args.fine_grained_activation_offloading:
-        from megatron.core.pipeline_parallel.utils import (
-            set_ideal_affinity_for_current_gpu
-        )
-        set_ideal_affinity_for_current_gpu()
+        from megatron.core.pipeline_parallel.utils import set_ideal_affinity_for_current_gpu
 
+        set_ideal_affinity_for_current_gpu()
+    if args.batch_invariant_mode:
+        print_rank_0("Enabling batch invariant mode globally", flush=True)
+        enable_batch_invariant_mode()
 
     if args.log_progress:
         append_to_progress_log("Starting job")
@@ -935,7 +1086,9 @@ def pretrain(
     # Initialize program_start_global with a fallback value in case set_startup_timestamps() wasn't called
     program_start_global = _TRAIN_START_TIME
     if _STARTUP_TIMESTAMPS['program_start'] is not None:
-        program_start_global = torch.tensor([_STARTUP_TIMESTAMPS['program_start']], dtype=torch.double, device='cuda')
+        program_start_global = torch.tensor(
+            [_STARTUP_TIMESTAMPS['program_start']], dtype=torch.double, device='cuda'
+        )
         torch.distributed.all_reduce(program_start_global, op=torch.distributed.ReduceOp.MIN)
         program_start_global = program_start_global.item()
     set_startup_timestamps(program_start=program_start_global)
@@ -955,26 +1108,41 @@ def pretrain(
     # Print basic megatron init time (using global min start)
     # NOTE(asolergi-nv): This is not entirely accurate, but we keep it for backwards compatibility.
     print_rank_0(
-        'time to initialize megatron (seconds): {:.3f}'.format(megatron_init_end - _LEGACY_TRAIN_START_TIME)
+        'time to initialize megatron (seconds): {:.3f}'.format(
+            megatron_init_end - _LEGACY_TRAIN_START_TIME
+        )
     )
 
     # Note, not entirely accurate as rank 0 might not be the first or last to hit these timestamps
-    print_datetime('after in-process setup and before initialize_megatron', timestamp_after_inprocess_setup)
-    print_datetime('after in-job setup and before initialize_megatron', timestamp_after_in_job_setup)
+    print_datetime(
+        'after in-process setup and before initialize_megatron', timestamp_after_inprocess_setup
+    )
+    print_datetime(
+        'after in-job setup and before initialize_megatron', timestamp_after_in_job_setup
+    )
 
     if program_start is not None and main_entry is not None and pretrain_entry is not None:
         # Inject startup deltas into timers
         startup_timers = {
-            'startup-program-entry-spread': program_start - program_start_global, # Local program start timestamp vs the global earliest program start timestamp
-            'startup-library-setup': main_entry - program_start, # Local library imports
-            'startup-program-setup': pretrain_entry - main_entry, # Local __main__ entry to pretrain entry
-            'startup-in-process-setup': timestamp_after_inprocess_setup - pretrain_entry, # Local in-process setup
-            'startup-in-job-setup': timestamp_after_in_job_setup - timestamp_after_inprocess_setup, # Local in-job setup
-            'startup-initialize-megatron': timestamp_after_initialize_megatron - timestamp_after_in_job_setup, # Local initialize megatron
-            'startup-set-jit-fusion-options': timestamp_after_set_jit_fusion_options - timestamp_after_initialize_megatron, # Local set JIT fusion options
-            'all-reduce-start-timestamps-tensor': megatron_init_end - timestamp_after_set_jit_fusion_options, # 2x All-reduce, first collective call
-            'startup-megatron-init-local': megatron_init_end - pretrain_entry, # Local megatron init
-            'startup-megatron-init-global': megatron_init_end - program_start_global, # Local megatron init vs the global earliest program start timestamp
+            'startup-program-entry-spread': program_start
+            - program_start_global,  # Local program start timestamp vs the global earliest program start timestamp
+            'startup-library-setup': main_entry - program_start,  # Local library imports
+            'startup-program-setup': pretrain_entry
+            - main_entry,  # Local __main__ entry to pretrain entry
+            'startup-in-process-setup': timestamp_after_inprocess_setup
+            - pretrain_entry,  # Local in-process setup
+            'startup-in-job-setup': timestamp_after_in_job_setup
+            - timestamp_after_inprocess_setup,  # Local in-job setup
+            'startup-initialize-megatron': timestamp_after_initialize_megatron
+            - timestamp_after_in_job_setup,  # Local initialize megatron
+            'startup-set-jit-fusion-options': timestamp_after_set_jit_fusion_options
+            - timestamp_after_initialize_megatron,  # Local set JIT fusion options
+            'all-reduce-start-timestamps-tensor': megatron_init_end
+            - timestamp_after_set_jit_fusion_options,  # 2x All-reduce, first collective call
+            'startup-megatron-init-local': megatron_init_end
+            - pretrain_entry,  # Local megatron init
+            'startup-megatron-init-global': megatron_init_end
+            - program_start_global,  # Local megatron init vs the global earliest program start timestamp
         }
         for name, delta in startup_timers.items():
             timers(name, log_level=0).set_elapsed(delta)
@@ -1003,8 +1171,8 @@ def pretrain(
                 LocalCheckpointManager,
             )
             from nvidia_resiliency_ext.checkpointing.local.replication.group_utils import (
-                parse_group_sequence,
                 GroupWrapper,
+                parse_group_sequence,
             )
             from nvidia_resiliency_ext.checkpointing.local.replication.strategies import (
                 CliqueReplicationStrategy,
@@ -1070,7 +1238,9 @@ def pretrain(
             # Build an isolated inference config so training config remains unchanged
             inference_config = copy.deepcopy(config)
             if args.rl_inference_tensor_model_parallel_size is not None:
-                inference_config.tensor_model_parallel_size = args.rl_inference_tensor_model_parallel_size
+                inference_config.tensor_model_parallel_size = (
+                    args.rl_inference_tensor_model_parallel_size
+                )
             if args.rl_inference_pipeline_model_parallel_size is not None:
                 inference_config.pipeline_model_parallel_size = (
                     args.rl_inference_pipeline_model_parallel_size
@@ -1134,11 +1304,15 @@ def pretrain(
         valid_data_iterator = []
         test_data_iterator = []
         for vp_stage in range(len(model)):
-            dataset_provider_parameters = inspect.signature(train_valid_test_dataset_provider).parameters
-            assert "vp_stage" in dataset_provider_parameters, \
-                "vp_stage must be a kwarg in train_valid_test_dataset_provider when using virtual pipeline parallelism"
-            vp_stage_train_valid_test_dataset_provider = \
-                functools.partial(train_valid_test_dataset_provider, vp_stage=vp_stage)
+            dataset_provider_parameters = inspect.signature(
+                train_valid_test_dataset_provider
+            ).parameters
+            assert (
+                "vp_stage" in dataset_provider_parameters
+            ), "vp_stage must be a kwarg in train_valid_test_dataset_provider when using virtual pipeline parallelism"
+            vp_stage_train_valid_test_dataset_provider = functools.partial(
+                train_valid_test_dataset_provider, vp_stage=vp_stage
+            )
             if getattr(train_valid_test_dataset_provider, 'is_distributed', False):
                 vp_stage_train_valid_test_dataset_provider.is_distributed = True
             iterators = build_train_valid_test_data_iterators(
@@ -1202,7 +1376,12 @@ def pretrain(
 
         print_datetime('after training is done')
 
-        if not args.skip_train and args.save and iteration != 0 and iteration % args.save_interval != 0:
+        if (
+            not args.skip_train
+            and args.save
+            and iteration != 0
+            and iteration % args.save_interval != 0
+        ):
             save_checkpoint_and_time(
                 iteration,
                 model,
@@ -1210,7 +1389,7 @@ def pretrain(
                 opt_param_scheduler,
                 num_floating_point_operations_so_far,
                 checkpointing_context,
-                train_data_iterator=train_data_iterator
+                train_data_iterator=train_data_iterator,
             )
 
         one_logger and one_logger.log_metrics(
@@ -1245,11 +1424,16 @@ def pretrain(
             )
         else:
             evaluate_and_print_results(
-                prefix, forward_step_func,
-                valid_data_iterator, model,
-                iteration, process_non_loss_data_func, config,
-                verbose=True, write_to_tensorboard=not args.skip_train,
-                non_loss_data_func=non_loss_data_func
+                prefix,
+                forward_step_func,
+                valid_data_iterator,
+                model,
+                iteration,
+                process_non_loss_data_func,
+                config,
+                verbose=True,
+                write_to_tensorboard=not args.skip_train,
+                non_loss_data_func=non_loss_data_func,
             )
 
     if args.do_test:
@@ -1310,7 +1494,13 @@ def update_train_iters(args):
     print_rank_0(f'setting training iterations to {args.train_iters}')
 
 
-def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True, config=None, pg_collection=None):
+def get_model(
+    model_provider_func,
+    model_type=ModelType.encoder_or_decoder,
+    wrap_with_ddp=True,
+    config=None,
+    pg_collection=None,
+):
     """Build the model."""
     args = get_args()
     args.model_type = model_type
@@ -1318,10 +1508,13 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
         pg_collection = ProcessGroupCollection.use_mpu_process_groups()
 
         if args.create_all_gather_group:
-            timeout = timedelta(minutes=args.distributed_timeout_minutes) if args.distributed_timeout_minutes else None
+            timeout = (
+                timedelta(minutes=args.distributed_timeout_minutes)
+                if args.distributed_timeout_minutes
+                else None
+            )
             dp_cp_ag, expt_dp_ag = create_all_gather_groups(
-                for_expert_parallelism=(args.expert_model_parallel_size > 1),
-                timeout=timeout,
+                for_expert_parallelism=(args.expert_model_parallel_size > 1), timeout=timeout
             )
             pg_collection.dp_cp_ag = dp_cp_ag
             pg_collection.expt_dp_ag = expt_dp_ag
@@ -1332,6 +1525,7 @@ def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap
 
     if has_nvidia_modelopt:
         from megatron.post_training.checkpointing import has_modelopt_state
+
         # [ModelOpt]: Check if the checkpoint is a ModelOpt checkpoint and
         # set a flag to use our model provider if so.
         if args.load is not None and has_modelopt_state(args.load):
@@ -1379,7 +1573,6 @@ def build_model():
             model.model_type = model_type
         return model
 
-
     if args.init_model_with_meta_device:
         with torch.device('meta'):
             model = build_model()
@@ -1405,9 +1598,7 @@ def build_model():
         print(
             ' > number of parameters on (tensor, pipeline) '
             'model parallel rank ({}, {}): {}'.format(
-                get_pg_rank(pg_collection.tp),
-                get_pg_rank(pg_collection.pp),
-                num_parameters,
+                get_pg_rank(pg_collection.tp), get_pg_rank(pg_collection.pp), num_parameters
             ),
             flush=True,
         )
@@ -1429,7 +1620,10 @@ def build_model():
 
     # Materialize tensors on meta device (GPU allocation) if not using FSDP2 and not using Megatron FSDP.
     if args.init_model_with_meta_device and not args.use_torch_fsdp2 and not args.use_megatron_fsdp:
-        model = [to_empty_if_meta_device(model_module, device=torch.device("cuda")) for model_module in model]
+        model = [
+            to_empty_if_meta_device(model_module, device=torch.device("cuda"))
+            for model_module in model
+        ]
 
     # Before TE2.x: The model_module.bfloat16()/model_module.half() above will call the inplace
     #               copy of TE's Float8Tensor, which will write an unwanted value (amax calculated
@@ -1451,7 +1645,9 @@ def build_model():
 
         if getattr(args, "use_torch_fsdp2", False):
             reshard_after_forward = getattr(args, "torch_fsdp2_reshard_after_forward", True)
-            ddp_config = TorchFullyShardedDataParallelConfig(reshard_after_forward=reshard_after_forward)
+            ddp_config = TorchFullyShardedDataParallelConfig(
+                reshard_after_forward=reshard_after_forward
+            )
         else:
             kwargs = {}
             for f in dataclasses.fields(DistributedDataParallelConfig):
@@ -1461,17 +1657,20 @@ def build_model():
             kwargs['check_for_nan_in_grad'] = args.check_for_nan_in_loss_and_grad
             kwargs['check_for_large_grads'] = args.check_for_large_grads
             if args.ddp_num_buckets is not None:
-                assert args.ddp_bucket_size is None, \
-                    "Cannot specify both --ddp-num-buckets and --ddp-bucket-size"
-                assert args.ddp_num_buckets > 0, \
-                    "--ddp-num-buckets must be greater than 0"
+                assert (
+                    args.ddp_bucket_size is None
+                ), "Cannot specify both --ddp-num-buckets and --ddp-bucket-size"
+                assert args.ddp_num_buckets > 0, "--ddp-num-buckets must be greater than 0"
                 kwargs['bucket_size'] = num_parameters // args.ddp_num_buckets
             else:
                 kwargs['bucket_size'] = args.ddp_bucket_size
             kwargs['pad_buckets_for_high_nccl_busbw'] = args.ddp_pad_buckets_for_high_nccl_busbw
-            kwargs['reduce_scatter_with_fp32_accumulation'] = args.ddp_reduce_scatter_with_fp32_accumulation
-            kwargs['param_name_patterns_for_fp32_local_accumulation'] = \
-                tuple(args.ddp_param_name_patterns_for_fp32_local_accumulation)
+            kwargs['reduce_scatter_with_fp32_accumulation'] = (
+                args.ddp_reduce_scatter_with_fp32_accumulation
+            )
+            kwargs['param_name_patterns_for_fp32_local_accumulation'] = tuple(
+                args.ddp_param_name_patterns_for_fp32_local_accumulation
+            )
             kwargs['average_in_collective'] = args.ddp_average_in_collective
             # Megatron-FSDP arguments.
             kwargs['megatron_fsdp_main_params_dtype'] = args.megatron_fsdp_main_params_dtype
@@ -1513,7 +1712,8 @@ def build_model():
                     module=model_chunk,
                     # Turn off bucketing for model_chunk 2 onwards, since communication
                     # for these model chunks is overlapped with compute anyway.
-                    disable_bucketing=(model_chunk_idx > 0) or args.overlap_param_gather_with_optimizer_step,
+                    disable_bucketing=(model_chunk_idx > 0)
+                    or args.overlap_param_gather_with_optimizer_step,
                     **dp_init_kwargs,
                 )
                 for (model_chunk_idx, model_chunk) in enumerate(model)
@@ -1602,11 +1802,7 @@ def get_megatron_optimizer_config(args: Any) -> OptimizerConfig:
     return config, config_overrides
 
 
-def setup_model_and_optimizer(
-    model_provider_func,
-    model_type,
-    checkpointing_context=None,
-):
+def setup_model_and_optimizer(model_provider_func, model_type, checkpointing_context=None):
     """Setup model and optimizer."""
     args = get_args()
     timers = get_timers()
@@ -1621,7 +1817,9 @@ def setup_model_and_optimizer(
     model = get_model(model_provider_func, model_type, wrap_with_ddp=wrap_with_ddp)
     unwrapped_model = unwrap_model(model)
 
-    one_logger and one_logger.log_metrics({"app_build_optimzer_start_time": one_logger_utils.get_timestamp_in_ms()})
+    one_logger and one_logger.log_metrics(
+        {"app_build_optimzer_start_time": one_logger_utils.get_timestamp_in_ms()}
+    )
     if skip_optimizer:
         optimizer, opt_param_scheduler = None, None
         # In RL inference-only mode, train_iters must still be set despite having no optimizer.
@@ -1652,7 +1850,9 @@ def setup_model_and_optimizer(
         )
         opt_param_scheduler = get_optimizer_param_scheduler(optimizer)
 
-    one_logger and one_logger.log_metrics({"app_build_optimzer_finish_time": one_logger_utils.get_timestamp_in_ms()})
+    one_logger and one_logger.log_metrics(
+        {"app_build_optimzer_finish_time": one_logger_utils.get_timestamp_in_ms()}
+    )
 
     if args.moe_use_upcycling:
         torch.distributed.barrier()
@@ -1790,23 +1990,45 @@ def dummy_train_step(data_iterator):
             batch = get_batch_on_this_cp_rank(batch)
 
 
-def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_scheduler, config, forward_backward_func, iteration=None):
+def train_step(
+    forward_step_func,
+    data_iterator,
+    model,
+    optimizer,
+    opt_param_scheduler,
+    config,
+    forward_backward_func,
+    iteration=None,
+):
     """Single training step."""
     args = get_args()
     timers = get_timers()
 
     rerun_state_machine = get_rerun_state_machine()
-    save_params_in_this_iteration = (args.save_params_interval is not None and
-                                     (iteration + 1) % args.save_params_interval == 0)
-    save_activations_in_this_iteration = (args.save_activations_interval is not None and
-                                          (iteration + 1) % args.save_activations_interval == 0)
-    save_tpe_in_this_iteration = (args.save_tokens_per_expert_interval is not None and
-                                  (iteration + 1) % args.save_tokens_per_expert_interval == 0)
-    save_wgrads_in_this_iteration = (args.save_wgrads_interval is not None and
-                                     (iteration + 1) % args.save_wgrads_interval == 0)
-    save_dgrads_in_this_iteration = (args.save_dgrads_interval is not None and
-                                     (iteration + 1) % args.save_dgrads_interval == 0)
+    save_params_in_this_iteration = (
+        args.save_params_interval is not None and (iteration + 1) % args.save_params_interval == 0
+    )
+    save_activations_in_this_iteration = (
+        args.save_activations_interval is not None
+        and (iteration + 1) % args.save_activations_interval == 0
+    )
+    save_tpe_in_this_iteration = (
+        args.save_tokens_per_expert_interval is not None
+        and (iteration + 1) % args.save_tokens_per_expert_interval == 0
+    )
+    save_wgrads_in_this_iteration = (
+        args.save_wgrads_interval is not None and (iteration + 1) % args.save_wgrads_interval == 0
+    )
+    save_dgrads_in_this_iteration = (
+        args.save_dgrads_interval is not None and (iteration + 1) % args.save_dgrads_interval == 0
+    )
     while rerun_state_machine.should_run_forward_backward(data_iterator):
+        # Offload optimizer states to CPU if enabled.
+        if args.offload_optimizer_states:
+            for optim_instance in optimizer.chained_optimizers:
+                if isinstance(optim_instance, DistributedOptimizer):
+                    optim_instance.offload_states()
+
         # Set grad to zero.
         for model_chunk in model:
             model_chunk.zero_grad_buffer()
@@ -1842,6 +2064,35 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch
                     if isinstance(optim_instance, DistributedOptimizer):
                         optim_instance._copy_main_params_to_param_buffer()
 
+        # Release GPU memory for offloaded optimizer states.
+        # This needs to be done after _copy_main_params_to_param_buffer().
+        # Separate offload and release to allow early D2H transfer to overlap with other operations.
+        if args.offload_optimizer_states:
+            for optim_instance in optimizer.chained_optimizers:
+                if isinstance(optim_instance, DistributedOptimizer):
+                    optim_instance.release_offloaded_gpu_states()
+
+        if config.sequence_packing_scheduler is not None:
+            # This wrapper is designed to support DP-balanced THD and dynamic-CP.
+            # Before wrapping, the data_iterator returns either a single sequence per get_item call, or a list where each element is a sequence.
+            # The wrapper is responsible for:
+            # 1. scheduling the sequences across ranks
+            # 2. packing them into THD format
+            # 3. broadcast flops parametes and num_microbatches to TP ranks to support unfixed num_microbatches
+            # 4. broadcast metadata(cu_seqlens, cu_seqlens_padded, max_seqlen, etc.) to PP ranks to
+            # 5. returning the packed data iterator and the FLOPs parameters
+            (
+                data_iterator,
+                num_microbatches,
+                seqlen_sum_this_global_batch,
+                seqlen_squared_sum_this_global_batch,
+            ) = wrap_data_iterator(data_iterator, config, get_num_microbatches())
+        else:
+            # data_iterator unchanged
+            num_microbatches = get_num_microbatches()
+            seqlen_sum_this_global_batch = args.seq_length * args.global_batch_size
+            seqlen_squared_sum_this_global_batch = args.seq_length**2 * args.global_batch_size
+
         # Forward pass.
         if save_activations_in_this_iteration:
             enable_activation_logging(model, args.save)
@@ -1853,7 +2104,7 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_sch
             forward_step_func=forward_step_func,
             data_iterator=data_iterator,
             model=model,
-            num_microbatches=get_num_microbatches(),
+            num_microbatches=num_microbatches,
             seq_length=args.seq_length,
             micro_batch_size=args.micro_batch_size,
             decoder_seq_length=args.decoder_seq_length,
@@ -1895,7 +2146,18 @@ def _save_state_dict(attr_name, label):
 
     should_checkpoint, should_exit, exit_code = rerun_state_machine.should_checkpoint_and_exit()
     if should_exit:
-        return {}, True, should_checkpoint, should_exit, exit_code, None, None, 0
+        return (
+            {},
+            True,
+            should_checkpoint,
+            should_exit,
+            exit_code,
+            None,
+            None,
+            0,
+            seqlen_sum_this_global_batch,
+            seqlen_squared_sum_this_global_batch,
+        )
 
     # Empty unused memory.
     if args.empty_unused_memory_level >= 1:
@@ -1960,8 +2222,7 @@ def _save_state_dict(attr_name, label):
                 # over the total number of tokens across the global batch.
                 val = torch.vstack(val).sum(dim=0)
                 torch.distributed.all_reduce(
-                    val,
-                    group=mpu.get_data_parallel_group(with_context_parallel=True)
+                    val, group=mpu.get_data_parallel_group(with_context_parallel=True)
                 )
                 loss_reduced[key] = val[0] / val[1]
             elif val[0].numel() == 1:
@@ -1979,8 +2240,21 @@ def _save_state_dict(attr_name, label):
             grad_norm,
             num_zeros_in_grad,
             log_max_attention_logit,
+            seqlen_sum_this_global_batch,
+            seqlen_squared_sum_this_global_batch,
         )
-    return {}, skipped_iter, should_checkpoint, should_exit, exit_code, grad_norm, num_zeros_in_grad, log_max_attention_logit
+    return (
+        {},
+        skipped_iter,
+        should_checkpoint,
+        should_exit,
+        exit_code,
+        grad_norm,
+        num_zeros_in_grad,
+        log_max_attention_logit,
+        seqlen_sum_this_global_batch,
+        seqlen_squared_sum_this_global_batch,
+    )
 
 
 def training_log(
@@ -1995,6 +2269,8 @@ def training_log(
     params_norm,
     num_zeros_in_grad,
     max_attention_logit,
+    seqlen_sum_this_global_batch,
+    seqlen_squared_sum_this_global_batch,
     pg_collection=None,
     is_first_iteration=False,
 ):
@@ -2038,35 +2314,39 @@ def training_log(
     # Logging.
     timers_to_log = []
     if args.timing_log_level >= 1:
-        timers_to_log.extend([
-            'forward-backward',
-            'layernorm-grads-all-reduce',
-            'embedding-grads-all-reduce',
-            'all-grads-sync',
-            'params-all-gather',
-            'optimizer-copy-to-main-grad',
-            'optimizer-unscale-and-check-inf',
-            'optimizer-clip-main-grad',
-            'optimizer-count-zeros',
-            'optimizer-inner-step',
-            'optimizer-copy-main-to-model-params',
-            'optimizer',
-        ])
+        timers_to_log.extend(
+            [
+                'forward-backward',
+                'layernorm-grads-all-reduce',
+                'embedding-grads-all-reduce',
+                'all-grads-sync',
+                'params-all-gather',
+                'optimizer-copy-to-main-grad',
+                'optimizer-unscale-and-check-inf',
+                'optimizer-clip-main-grad',
+                'optimizer-count-zeros',
+                'optimizer-inner-step',
+                'optimizer-copy-main-to-model-params',
+                'optimizer',
+            ]
+        )
     if args.timing_log_level >= 2:
-        timers_to_log.extend([
-            'batch-generator',
-            'forward-compute',
-            'backward-compute',
-            'forward-recv',
-            'forward-send',
-            'backward-recv',
-            'backward-send',
-            'forward-send-forward-recv',
-            'forward-send-backward-recv',
-            'backward-send-forward-recv',
-            'backward-send-backward-recv',
-            'forward-backward-send-forward-backward-recv',
-        ])
+        timers_to_log.extend(
+            [
+                'batch-generator',
+                'forward-compute',
+                'backward-compute',
+                'forward-recv',
+                'forward-send',
+                'backward-recv',
+                'backward-send',
+                'forward-send-forward-recv',
+                'forward-send-backward-recv',
+                'backward-send-forward-recv',
+                'backward-send-backward-recv',
+                'forward-backward-send-forward-backward-recv',
+            ]
+        )
     # Add timers from RL loop if needed.
     if getattr(args, 'perform_rl_step', False):
         timers_to_log.extend(RL_LOGGABLE_TIMER_NAMES)
@@ -2087,7 +2367,9 @@ def training_log(
             wandb_writer.log({'samples vs steps': args.consumed_train_samples}, iteration)
         if learning_rate is not None:
             writer.add_scalar('learning-rate', learning_rate, iteration)
-            writer.add_scalar('learning-rate vs samples', learning_rate, args.consumed_train_samples)
+            writer.add_scalar(
+                'learning-rate vs samples', learning_rate, args.consumed_train_samples
+            )
             if wandb_writer:
                 wandb_writer.log({'learning-rate': learning_rate}, iteration)
         if args.skipped_train_samples > 0:
@@ -2138,10 +2420,14 @@ def training_log(
             if wandb_writer:
                 wandb_writer.log({'params-norm': params_norm}, iteration)
         if args.perform_rl_step:
-            grpo_collection_iteration = iteration // (args.grpo_iterations * ( ( args.grpo_samples_per_iteration )// args.global_batch_size ))
+            grpo_collection_iteration = iteration // (
+                args.grpo_iterations * ((args.grpo_samples_per_iteration) // args.global_batch_size)
+            )
             writer.add_scalar('grpo_collection_iteration', grpo_collection_iteration, iteration)
             if wandb_writer:
-                wandb_writer.log({'grpo_collection_iteration': grpo_collection_iteration}, iteration)
+                wandb_writer.log(
+                    {'grpo_collection_iteration': grpo_collection_iteration}, iteration
+                )
         if args.log_memory_to_tensorboard:
             mem_stats = torch.cuda.memory_stats()
             writer.add_scalar(
@@ -2158,8 +2444,8 @@ def training_log(
             writer.add_scalar('max_attention_logit', max_attention_logit, iteration)
             if wandb_writer:
                 wandb_writer.log({'max_attention_logit': max_attention_logit}, iteration)
-
     # Log MoE metrics.
+    moe_log_string = ""
     if args.num_experts is not None:
         moe_loss_scale = 1 / get_num_microbatches()
         track_names = []
@@ -2176,18 +2462,19 @@ def training_log(
             from operator import itemgetter
 
             from megatron.core.ssm.mamba_hybrid_layer_allocation import (
-                Symbols, get_hybrid_layer_counts,
+                Symbols,
+                get_hybrid_layer_counts,
             )
+
             layers = itemgetter(Symbols.MOE)(get_hybrid_layer_counts(args.hybrid_layer_pattern))
         else:
             layers = args.num_layers
 
-        track_moe_metrics(
+        moe_log_string = get_moe_metrics_tracker().report(
             loss_scale=moe_loss_scale,
             iteration=iteration,
             writer=writer,
             wandb_writer=wandb_writer,
-            total_loss_dict=total_loss_dict,
             per_layer_logging=args.moe_per_layer_logging,
             force_initialize=True,
             track_names=track_names,
@@ -2195,7 +2482,16 @@ def training_log(
             moe_layer_freq=args.moe_layer_freq,
             mtp_num_layers=args.mtp_num_layers,
             pg_collection=pg_collection,
+            total_loss_dict=total_loss_dict,
         )
+        if getattr(args, 'log_moe_overload_factor', False):
+            overload_log_string = get_moe_overload_factor_tracker().report(
+                iteration=iteration,
+                writer=writer,
+                wandb_writer=wandb_writer,
+                per_layer_logging=args.moe_per_layer_logging,
+            )
+            moe_log_string = moe_log_string + overload_log_string
 
     # Log MTP metrics.
     if args.mtp_num_layers is not None:
@@ -2203,7 +2499,6 @@ def training_log(
         MTPLossLoggingHelper.track_mtp_metrics(
             mtp_loss_scale, iteration, writer, wandb_writer, total_loss_dict
         )
-
     # Track sparse attention indexer loss.
     if args.dsa_indexer_loss_coeff is not None and args.dsa_indexer_loss_coeff > 0:
         indexer_loss_scale = 1 / get_num_microbatches()
@@ -2214,10 +2509,11 @@ def training_log(
             wandb_writer=wandb_writer,
             total_loss_dict=total_loss_dict,
         )
-
     # Dump memory snapshot and print metrics to stdout.
     if iteration % args.log_interval == 0 or is_first_iteration:
-        if args.record_memory_history and (is_last_rank() or torch.distributed.get_backend() == 'fake'):
+        if args.record_memory_history and (
+            is_last_rank() or torch.distributed.get_backend() == 'fake'
+        ):
             snapshot = torch.cuda.memory._snapshot()
             from pickle import dump
 
@@ -2227,9 +2523,9 @@ def training_log(
         elapsed_time = timers('interval-time').elapsed(barrier=True, reset=should_reset)
         elapsed_time_per_iteration = elapsed_time / total_iterations
 
-        throughput = num_floating_point_operations(args, batch_size) / (
-            elapsed_time_per_iteration * 10**12 * args.world_size
-        )
+        throughput = num_floating_point_operations(
+            args, seqlen_sum_this_global_batch, seqlen_squared_sum_this_global_batch
+        ) / (elapsed_time_per_iteration * 10**12 * args.world_size)
 
         one_logger_utils.track_e2e_metrics(args.log_throughput, throughput)
 
@@ -2282,6 +2578,8 @@ def training_log(
                     log_string += ' {}: {:.6E} |'.format(key, avg)
                 if should_reset:
                     total_loss_dict[key] = torch.tensor([0.0], dtype=torch.float, device='cuda')
+        if args.num_experts is not None and moe_log_string:
+            log_string += moe_log_string
         log_string += f' loss scale: {loss_scale:.1f} |'
         if grad_norm is not None:
             log_string += f' grad norm: {grad_norm:.3f} |'
@@ -2297,7 +2595,7 @@ def training_log(
         # RL token throughput metrics.
         if args.perform_rl_step:
             log_string += rl_utils.log_rl_throughput_metrics(
-                args, batch_size, elapsed_time_per_iteration, iteration, wandb_writer,
+                args, batch_size, elapsed_time_per_iteration, iteration, wandb_writer
             )
 
         if should_reset:
@@ -2317,13 +2615,20 @@ def training_log(
             if iteration > (loaded_iteration + 1):
                 # Make sure the memory after the second iteration is reported to include optimizer state memory.
                 report_memory_flag = False
-        if args.log_memory_interval is not None and iteration % args.log_memory_interval == 0 and \
-            not reported_memory_in_this_iteration:
+        if (
+            args.log_memory_interval is not None
+            and iteration % args.log_memory_interval == 0
+            and not reported_memory_in_this_iteration
+        ):
             report_memory(f'(after {iteration} iterations)')
         # Write timers to wandb, don't reset the counts.
         if args.log_timers_to_tensorboard:
-            timers.write(timers_to_log, writer, iteration, normalizer=args.log_interval, reset=False)
-            timers.write(timers_to_log, wandb_writer, iteration, normalizer=args.log_interval, reset=False)
+            timers.write(
+                timers_to_log, writer, iteration, normalizer=args.log_interval, reset=False
+            )
+            timers.write(
+                timers_to_log, wandb_writer, iteration, normalizer=args.log_interval, reset=False
+            )
         # Log timers to stdout
         timers.log(timers_to_log, normalizer=args.log_interval, reset=should_reset)
 
@@ -2380,10 +2685,12 @@ def force_param_sync(model_chunks: list[DDP]) -> None:
         assert isinstance(model_chunk, DDP)
         model_chunk.start_param_sync(force_sync=True)
 
+
 # Only report memory for first 3 checkpoint saves.
 num_checkpoints_memory_reported = 0
 MAX_NUM_CHECKPOINTS_MEMORY_REPORTED = 3
 
+
 def save_checkpoint_and_time(
     iteration,
     model,
@@ -2404,7 +2711,8 @@ def save_checkpoint_and_time(
 
     # Stop timer to get accurate train interval time and exclude checkpointing duration
     timers('interval-time').stop()
-    energy_monitor.pause()
+    if args.log_energy:
+        energy_monitor.pause()
 
     # Extra barrier is added to make sure all ranks report the max time.
     timer_key = 'save-checkpoint-non-persistent' if non_persistent_ckpt else 'save-checkpoint'
@@ -2438,11 +2746,11 @@ def save_checkpoint_and_time(
         train_data_iterator=train_data_iterator,
         preprocess_common_state_dict_fn=preprocess_common_state_dict,
     )
-    
+
     # Stop timer and compute time elapsed to save checkpoint. Stop timer before timers.log() call as it resets the timer.
     timers(timer_key).stop(barrier=True)
     save_checkpoint_duration = timers(timer_key).elapsed(reset=False)
-    
+
     if should_report_memory:
         # Track memory after checkpoint save.
         report_memory(f"(after save_checkpoint for iteration {iteration})")
@@ -2467,7 +2775,9 @@ def save_checkpoint_and_time(
         )
 
     # Recover timing
-    energy_monitor.resume()
+    if args.log_energy:
+        energy_monitor.resume()
+
     timers('interval-time', log_level=0).start(barrier=True)
 
 
@@ -2478,7 +2788,7 @@ def post_training_step_callbacks(
     iteration,
     prof,
     num_floating_point_operations_since_last_log_event,
-    nsys_nvtx_context = None,
+    nsys_nvtx_context=None,
 ):
     """Run all post-training-step functions (e.g., FT heartbeats, GC)."""
     args = get_args()
@@ -2516,8 +2826,7 @@ def post_training_step_callbacks(
     if (
         args.profile
         and iteration == args.profile_step_end
-        and (len(args.profile_ranks) == 0 or
-             torch.distributed.get_rank() in args.profile_ranks)
+        and (len(args.profile_ranks) == 0 or torch.distributed.get_rank() in args.profile_ranks)
     ):
         # Disable NVTX range when profiling ends.
         if args.nvtx_ranges:
@@ -2629,12 +2938,8 @@ def checkpoint_and_decide_exit(
             return True
 
     # Exit based on iterations.
-    if (
-        args.exit_interval
-        and iteration % args.exit_interval == 0
-    ) or (
-        args.phase_transition_iterations
-        and iteration in args.phase_transition_iterations
+    if (args.exit_interval and iteration % args.exit_interval == 0) or (
+        args.phase_transition_iterations and iteration in args.phase_transition_iterations
     ):
         if args.save and not saved_checkpoint:
             save_checkpoint_and_time(
@@ -2690,29 +2995,31 @@ def train(
             args.load = None
             args.finetune = True
             load_checkpoint(
-                    model,
-                    None,  # Don't load optimizer state
-                    None,  # Don't load scheduler state
-                    checkpointing_context=checkpointing_context,
-                    skip_load_to_model_and_opt=HAVE_FSDP2
-                    and getattr(args, "use_torch_fsdp2", False)
-                    and args.ckpt_format == "torch_dist",
-                )
-            ref_state_dict = {k: (v.cpu() if v is not None else v) for k, v in model[0].state_dict().items()}
+                model,
+                None,  # Don't load optimizer state
+                None,  # Don't load scheduler state
+                checkpointing_context=checkpointing_context,
+                skip_load_to_model_and_opt=HAVE_FSDP2
+                and getattr(args, "use_torch_fsdp2", False)
+                and args.ckpt_format == "torch_dist",
+            )
+            ref_state_dict = {
+                k: (v.cpu() if v is not None else v) for k, v in model[0].state_dict().items()
+            }
 
             # Reload RL training checkpoint weights
             args.load = load
             args.finetune = finetune
             print_rank_0("> Reloading RL training checkpoint...")
             load_checkpoint(
-                    model,
-                    None,
-                    None,
-                    checkpointing_context=checkpointing_context,
-                    skip_load_to_model_and_opt=HAVE_FSDP2
-                    and getattr(args, "use_torch_fsdp2", False)
-                    and args.ckpt_format == "torch_dist",
-                )
+                model,
+                None,
+                None,
+                checkpointing_context=checkpointing_context,
+                skip_load_to_model_and_opt=HAVE_FSDP2
+                and getattr(args, "use_torch_fsdp2", False)
+                and args.ckpt_format == "torch_dist",
+            )
 
             args.no_load_optim = no_load_optim
 
@@ -2721,8 +3028,9 @@ def train(
         print_rank_0("> Reinitializing microbatch calculator for GRPO training...")
         from megatron.core.num_microbatches_calculator import (
             destroy_num_microbatches_calculator,
-            init_num_microbatches_calculator
+            init_num_microbatches_calculator,
         )
+
         # First destroy the existing calculator
         destroy_num_microbatches_calculator()
         # Then initialize with the correct perform_rl_step=True context
@@ -2740,14 +3048,12 @@ def train(
     energy_monitor = get_energy_monitor()
     one_logger = get_one_logger()
 
-    if args.hybrid_context_parallel:
-        train_data_iterator = iter(HybridCPDataLoaderWrapper(train_data_iterator, config))
-
     if args.run_workload_inspector_server:
         try:
-            from workload_inspector.utils.webserver import run_server
             import threading
 
+            from workload_inspector.utils.webserver import run_server
+
             threading.Thread(
                 target=run_server, daemon=True, args=(torch.distributed.get_rank(),)
             ).start()
@@ -2771,8 +3077,10 @@ def train(
     # Make sure rerun_state_machine has the right iteration loaded from checkpoint.
     rerun_state_machine = get_rerun_state_machine()
     if rerun_state_machine.current_iteration != iteration:
-        print_rank_0(f"Overwriting rerun_state_machine.current_iteration from "
-                     f"{rerun_state_machine.current_iteration} to {iteration}...")
+        print_rank_0(
+            f"Overwriting rerun_state_machine.current_iteration from "
+            f"{rerun_state_machine.current_iteration} to {iteration}..."
+        )
         rerun_state_machine.current_iteration = iteration
 
     # Track E2E metrics at the start of training.
@@ -2809,7 +3117,21 @@ def train(
         config.param_sync_func = [model_chunk.start_param_sync for model_chunk in model]
         if len(model) == 1:
             config.param_sync_func = config.param_sync_func[0]
-    config.finalize_model_grads_func = finalize_model_grads
+
+    # Wrap finalize_model_grads to reload offloaded optimizer states before grad finalization.
+    # This allows H2D transfer to overlap with grad all-reduce.
+    if args.offload_optimizer_states:
+
+        def finalize_model_grads_with_state_reload(*fmg_args, **fmg_kwargs):
+            # Reload offloaded states for all DistributedOptimizer instances
+            for optim_instance in optimizer.chained_optimizers:
+                if isinstance(optim_instance, DistributedOptimizer):
+                    optim_instance.reload_offloaded_states()
+            return finalize_model_grads(*fmg_args, **fmg_kwargs)
+
+        config.finalize_model_grads_func = finalize_model_grads_with_state_reload
+    else:
+        config.finalize_model_grads_func = finalize_model_grads
 
     if args.log_energy:
         energy_monitor.setup()
@@ -2853,9 +3175,13 @@ def train(
     # Wrap forward_backward_func for Full iteration CUDA graph
     forward_backward_func = get_forward_backward_func()
     if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope:
-        forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps)
+        forward_backward_func = FullCudaGraphWrapper(
+            forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps
+        )
     if args.optimizer_cuda_graph:
-        optimizer.step = OptimizerCudaGraphWrapper(optimizer.step, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps)
+        optimizer.step = OptimizerCudaGraphWrapper(
+            optimizer.step, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps
+        )
 
     def get_e2e_base_metrics():
         """Get base metrics values for one-logger to calculate E2E tracking metrics."""
@@ -2880,23 +3206,26 @@ def get_e2e_base_metrics():
             one_logger.store_set('get_e2e_base_metrics', get_e2e_base_metrics)
 
     prof = None
-    nsys_nvtx_context = None # reference to context for nsys profiling, so it can be cleaned up
+    nsys_nvtx_context = None  # reference to context for nsys profiling, so it can be cleaned up
     if (
         args.profile
-        and (len(args.profile_ranks) == 0 or
-             torch.distributed.get_rank() in args.profile_ranks)
+        and (len(args.profile_ranks) == 0 or torch.distributed.get_rank() in args.profile_ranks)
         and args.use_pytorch_profiler
     ):
         if args.pytorch_profiler_collect_chakra:
             et_dir = Path(f"{args.tensorboard_dir}/../chakra")
             et_dir.mkdir(parents=True, exist_ok=True)
-            et = torch.profiler.ExecutionTraceObserver().register_callback(f"{et_dir}/rank-{torch.distributed.get_rank()}.json.gz")
+            et = torch.profiler.ExecutionTraceObserver().register_callback(
+                f"{et_dir}/rank-{torch.distributed.get_rank()}.json.gz"
+            )
         else:
             et = None
+
         def trace_handler(p):
             profile_dir = Path(f"{args.tensorboard_dir}/../torch_profile")
             profile_dir.mkdir(parents=True, exist_ok=True)
             p.export_chrome_trace(f"{profile_dir}/rank-{torch.distributed.get_rank()}.json.gz")
+
         prof = torch.profiler.profile(
             schedule=torch.profiler.schedule(
                 wait=max(args.profile_step_start - 1, 0),
@@ -2943,9 +3272,9 @@ def trace_handler(p):
     # Run training iterations till done.
     buffered_rollouts = None
     while iteration < args.train_iters:
-        if (args.profile 
-            and (len(args.profile_ranks) == 0 or
-                 torch.distributed.get_rank() in args.profile_ranks)):
+        if args.profile and (
+            len(args.profile_ranks) == 0 or torch.distributed.get_rank() in args.profile_ranks
+        ):
             # Enable NVTX range when profiling starts and nvtx_ranges is set.
             if iteration == args.profile_step_start and args.nvtx_ranges:
                 configure_nvtx_profiling(True)
@@ -2953,7 +3282,9 @@ def trace_handler(p):
                 prof.step()
             elif iteration == args.profile_step_start:
                 torch.cuda.check_error(torch.cuda.cudart().cudaProfilerStart())
-                nsys_nvtx_context = torch.autograd.profiler.emit_nvtx(record_shapes=args.record_shapes)
+                nsys_nvtx_context = torch.autograd.profiler.emit_nvtx(
+                    record_shapes=args.record_shapes
+                )
                 nsys_nvtx_context.__enter__()
 
         ft_integration.on_checkpointing_start()
@@ -2962,7 +3293,10 @@ def trace_handler(p):
         # Update the timeout for all process groups after initialization
         # We update the timeout after the first successful iteration,
         # which takes longer than others usually
-        if args.distributed_timeout_seconds_after_init is not None and iteration == start_iteration+1:
+        if (
+            args.distributed_timeout_seconds_after_init is not None
+            and iteration == start_iteration + 1
+        ):
             # TODO: some dynamic timeout setting is required
             # based on the iteration time considering interval-based steps (e.g. eval, checkpoint)
             # e.g. timeout for normal iterations vs timeout for iterations with checkpoint
@@ -3038,7 +3372,11 @@ def trace_handler(p):
                 torch.cuda.empty_cache()
             with torch.no_grad():
                 train_data_iterator = rl_utils.get_grpo_data_iterator(
-                    model, inference_model, optimizer, iteration, ref_state_dict,
+                    model,
+                    inference_model,
+                    optimizer,
+                    iteration,
+                    ref_state_dict,
                     grpo_iterations=args.grpo_iterations,
                     grpo_prompts_per_step=args.grpo_prompts_per_step,
                     grpo_group_size=args.grpo_group_size,
@@ -3062,6 +3400,8 @@ def trace_handler(p):
             grad_norm = 0.0
             num_zeros_in_grad = 0
             max_attention_logit = None
+            seqlen_sum_this_global_batch = 0
+            seqlen_squared_sum_this_global_batch = 0
         else:
             ft_integration.on_training_step_start()
             (
@@ -3073,8 +3413,17 @@ def trace_handler(p):
                 grad_norm,
                 num_zeros_in_grad,
                 max_attention_logit,
+                seqlen_sum_this_global_batch,
+                seqlen_squared_sum_this_global_batch,
             ) = train_step(
-                forward_step_func, train_data_iterator, model, optimizer, opt_param_scheduler, config, forward_backward_func, iteration=iteration
+                forward_step_func,
+                train_data_iterator,
+                model,
+                optimizer,
+                opt_param_scheduler,
+                config,
+                forward_backward_func,
+                iteration=iteration,
             )
             ft_integration.on_training_step_end()
         if should_checkpoint:
@@ -3124,7 +3473,7 @@ def trace_handler(p):
             getattr(args, "fsdp_manual_registration", False)
             and getattr(args, "nccl_ub", False)
             and getattr(args, "use_megatron_fsdp", False)
-            and iteration ==  start_iteration + 1
+            and iteration == start_iteration + 1
         ):
             for model_chunk in model:
                 if isinstance(model_chunk, megatron_FSDP) and getattr(
@@ -3161,7 +3510,9 @@ def trace_handler(p):
         else:
             assert num_skipped_samples_in_batch == 0
         args.skipped_train_samples += num_skipped_samples_in_batch
-        num_floating_point_operations_in_batch = num_floating_point_operations(args, batch_size)
+        num_floating_point_operations_in_batch = num_floating_point_operations(
+            args, seqlen_sum_this_global_batch, seqlen_squared_sum_this_global_batch
+        )
         num_floating_point_operations_so_far += num_floating_point_operations_in_batch
         num_floating_point_operations_since_last_log_event += num_floating_point_operations_in_batch
 
@@ -3190,14 +3541,20 @@ def trace_handler(p):
             params_norm,
             num_zeros_in_grad,
             max_attention_logit,
+            seqlen_sum_this_global_batch,
+            seqlen_squared_sum_this_global_batch,
             pg_collection=model_pg_collection,
             is_first_iteration=is_first_iteration,
         )
         is_first_iteration = False
 
         # Evaluation.
-        if args.eval_interval and iteration % args.eval_interval == 0 and args.do_valid \
-                and (args.start_eval_at_iter is None or iteration >= args.start_eval_at_iter):
+        if (
+            args.eval_interval
+            and iteration % args.eval_interval == 0
+            and args.do_valid
+            and (args.start_eval_at_iter is None or iteration >= args.start_eval_at_iter)
+        ):
             if args.log_energy:
                 energy_monitor.pause()
             timers('interval-time').stop()
@@ -3231,14 +3588,23 @@ def trace_handler(p):
                     training_model=rl_training_model,
                 )
             else:
-                evaluate_and_print_results(prefix, forward_step_func,
-                                       valid_data_iterator, model,
-                                       iteration, process_non_loss_data_func,
-                                       config, verbose=False, write_to_tensorboard=True,
-                                       non_loss_data_func=non_loss_data_func)
+                evaluate_and_print_results(
+                    prefix,
+                    forward_step_func,
+                    valid_data_iterator,
+                    model,
+                    iteration,
+                    process_non_loss_data_func,
+                    config,
+                    verbose=False,
+                    write_to_tensorboard=True,
+                    non_loss_data_func=non_loss_data_func,
+                )
 
             eval_duration += timers('eval-time').elapsed()
-            eval_iterations += sum(args.eval_iters) if isinstance(args.eval_iters, list) else args.eval_iters
+            eval_iterations += (
+                sum(args.eval_iters) if isinstance(args.eval_iters, list) else args.eval_iters
+            )
             timers('eval-time').stop()
             one_logger_utils.track_e2e_metrics()
 
@@ -3252,7 +3618,9 @@ def trace_handler(p):
             if args.log_energy:
                 energy_monitor.resume()
             if args.num_experts is not None:
-                clear_aux_losses_tracker()
+                get_moe_metrics_tracker().clear()
+                if getattr(args, 'log_moe_overload_factor', False):
+                    get_moe_overload_factor_tracker().clear()
 
         # Miscellaneous post-training-step functions (e.g., FT heartbeats, GC).
         # Some of these only happen at specific iterations. Capture updated FLOPs accumulator
@@ -3358,7 +3726,15 @@ def evaluate(
     eval_num_microbatches = eval_batch_size // (eval_micro_batch_size * args.data_parallel_size)
     forward_backward_func = get_forward_backward_func()
     if args.cuda_graph_impl == "local" and CudaGraphScope.full_iteration in args.cuda_graph_scope:
-        forward_backward_func = FullCudaGraphWrapper(forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps)
+        forward_backward_func = FullCudaGraphWrapper(
+            forward_backward_func, cuda_graph_warmup_steps=args.cuda_graph_warmup_steps
+        )
+    # Wrap forward_backward_func for overflow handling with moe_expert_rank_capacity_factor
+    if args.moe_expert_rank_capacity_factor is not None:
+        copy_main_params = args.reuse_grad_buf_for_mxfp8_param_ag and args.overlap_param_gather
+        forward_backward_func = PagedStashRunner(
+            config, copy_main_params, model, None, forward_backward_func
+        )
 
     if has_nvidia_modelopt:
         # [ModelOpt]: Pipeline-parallel Distillation stacks student and teacher tensors
@@ -3386,11 +3762,30 @@ def evaluate(
             # Don't care about timing during evaluation
             config.timers = None
             ft_integration.on_eval_step_start()
+            if config.sequence_packing_scheduler is not None:
+                # This wrapper is designed to support DP-balanced THD and dynamic-CP.
+                # Before wrapping, the data_iterator returns either a single sequence per get_item call, or a list where each element is a sequence.
+                # The wrapper is responsible for:
+                # 1. scheduling the sequences across ranks
+                # 2. packing them into THD format
+                # 3. broadcast flops parametes and num_microbatches to TP ranks to support unfixed num_microbatches
+                # 4. broadcast metadata(cu_seqlens, cu_seqlens_padded, max_seqlen, etc.) to PP ranks to
+                # 5. returning the packed data iterator and the FLOPs parameters
+                try:
+                    (packed_data_iterator, scheduled_eval_num_microbatches, _, _) = (
+                        wrap_data_iterator(data_iterator, config, eval_num_microbatches)
+                    )
+                except StopIteration:
+                    # Validation data iterator exhausted, stop evaluation early.
+                    break
+            else:
+                packed_data_iterator = data_iterator
+                scheduled_eval_num_microbatches = eval_num_microbatches
             loss_dicts = forward_backward_func(
                 forward_step_func=forward_step_func,
-                data_iterator=data_iterator,
+                data_iterator=packed_data_iterator,
                 model=model,
-                num_microbatches=eval_num_microbatches,
+                num_microbatches=scheduled_eval_num_microbatches,
                 seq_length=args.seq_length,
                 micro_batch_size=eval_micro_batch_size,
                 decoder_seq_length=args.decoder_seq_length,
@@ -3408,7 +3803,9 @@ def evaluate(
                 # Reduce across processes.
                 for key in loss_dicts[0].keys():
                     if key not in total_loss_dict:
-                        total_loss_dict[key] = torch.tensor([0.0, 0.0], dtype=torch.float, device='cuda')
+                        total_loss_dict[key] = torch.tensor(
+                            [0.0, 0.0], dtype=torch.float, device='cuda'
+                        )
                     val = [x[key].view(-1) for x in loss_dicts]
 
                     if val[0].numel() == 2:
@@ -3418,19 +3815,17 @@ def evaluate(
                             val = val[:, 0] / val[:, 1].clamp(min=1)
                             val = val.mean()
                             torch.distributed.all_reduce(
-                                val,
-                                group=mpu.get_data_parallel_group(with_context_parallel=True)
+                                val, group=mpu.get_data_parallel_group(with_context_parallel=True)
                             )
                             val /= torch.distributed.get_world_size(
                                 group=mpu.get_data_parallel_group(with_context_parallel=True)
                             )
                             total_loss_dict[key][0] += val
                             total_loss_dict[key][1] += 1
-                        else :
+                        else:
                             val = torch.vstack(val).sum(dim=0)
                             torch.distributed.all_reduce(
-                                val,
-                                group=mpu.get_data_parallel_group(with_context_parallel=True)
+                                val, group=mpu.get_data_parallel_group(with_context_parallel=True)
                             )
                             total_loss_dict[key] += val
                     elif val[0].numel() == 1:
@@ -3553,7 +3948,9 @@ def evaluate_and_print_results(
             ppl = math.exp(min(20, total_loss_dict[key].item()))
             string += '{} PPL: {:.6E} | '.format(key, ppl)
             if writer:
-                writer.add_scalar('{} validation{}'.format(key, suffix), total_loss_dict[key].item(), iteration)
+                writer.add_scalar(
+                    '{} validation{}'.format(key, suffix), total_loss_dict[key].item(), iteration
+                )
                 writer.add_scalar(
                     '{} validation{} vs samples'.format(key, suffix),
                     total_loss_dict[key].item(),
@@ -3562,11 +3959,14 @@ def evaluate_and_print_results(
                 if args.log_validation_ppl_to_tensorboard:
                     writer.add_scalar('{} validation{} ppl'.format(key, suffix), ppl, iteration)
                     writer.add_scalar(
-                        '{} validation{} ppl vs samples'.format(key, suffix), ppl, args.consumed_train_samples
+                        '{} validation{} ppl vs samples'.format(key, suffix),
+                        ppl,
+                        args.consumed_train_samples,
                     )
                 if wandb_writer and is_last_rank():
                     wandb_writer.log(
-                        {'{} validation{}'.format(key, suffix): total_loss_dict[key].item()}, iteration
+                        {'{} validation{}'.format(key, suffix): total_loss_dict[key].item()},
+                        iteration,
                     )
 
         if process_non_loss_data_func is not None and writer and is_last_rank():
@@ -3611,7 +4011,11 @@ def get_train_valid_test_num_samples():
 
     # Get train_samples in current phase.
     if args.phase_transition_iterations:
-        phase_transition_samples = [0] + [t * args.global_batch_size for t in args.phase_transition_iterations] + [args.train_samples]
+        phase_transition_samples = (
+            [0]
+            + [t * args.global_batch_size for t in args.phase_transition_iterations]
+            + [args.train_samples]
+        )
         current_sample = args.iteration * args.global_batch_size
         last_transition_sample = max(s for s in phase_transition_samples if s <= current_sample)
         next_transition_sample = min(s for s in phase_transition_samples if s > current_sample)
@@ -3622,18 +4026,24 @@ def get_train_valid_test_num_samples():
     return (train_samples_in_current_phase, eval_samples, test_samples)
 
 
-def build_train_valid_test_datasets(build_train_valid_test_datasets_provider, train_valid_test_num_samples=None):
+def build_train_valid_test_datasets(
+    build_train_valid_test_datasets_provider, train_valid_test_num_samples=None, vp_stage=None
+):
     """Build pretraining datasets."""
     if train_valid_test_num_samples is None:
         train_valid_test_num_samples = get_train_valid_test_num_samples()
-    print_rank_0(' > datasets target sizes (minimum size):')
     print_rank_0('    train:      {}'.format(train_valid_test_num_samples[0]))
     print_rank_0('    validation: {}'.format(train_valid_test_num_samples[1]))
     print_rank_0('    test:       {}'.format(train_valid_test_num_samples[2]))
-    return build_train_valid_test_datasets_provider(train_valid_test_num_samples)
+    if vp_stage is not None:
+        return build_train_valid_test_datasets_provider(
+            train_valid_test_num_samples, vp_stage=vp_stage
+        )
+    else:
+        return build_train_valid_test_datasets_provider(train_valid_test_num_samples)
 
 
-def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider):
+def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider, vp_stage=None):
     """Build pretraining data loaders."""
 
     args = get_args()
@@ -3661,8 +4071,14 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider
 
     # Get consumed train samples in this phase.
     if args.phase_transition_iterations:
-        last_transition = max(iteration for iteration in (0, *args.phase_transition_iterations) if iteration <= args.iteration)
-        consumed_train_samples_in_current_phase = (args.iteration - last_transition) * args.global_batch_size
+        last_transition = max(
+            iteration
+            for iteration in (0, *args.phase_transition_iterations)
+            if iteration <= args.iteration
+        )
+        consumed_train_samples_in_current_phase = (
+            args.iteration - last_transition
+        ) * args.global_batch_size
     else:
         consumed_train_samples_in_current_phase = args.consumed_train_samples
 
@@ -3679,17 +4095,21 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider
             valid_dataloaders = None
             test_dataloader = None
             do_train = (args.train_iters or 0) > 0
-            do_valid = (args.full_validation or args.eval_iters > 0)
-            do_test = (args.full_validation or args.eval_iters > 0)
+            do_valid = args.full_validation or args.eval_iters > 0
+            do_test = args.full_validation or args.eval_iters > 0
 
         else:
             # Build datasets.
-            train_ds, valid_ds, test_ds = build_train_valid_test_datasets(build_train_valid_test_datasets_provider)
+            train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
+                build_train_valid_test_datasets_provider, vp_stage=vp_stage
+            )
             valid_ds = [valid_ds] if not isinstance(valid_ds, list) else valid_ds
             if args.skip_train:
                 train_dataloader = None
             else:
-                train_dataloader = build_pretraining_data_loader(train_ds, consumed_train_samples_in_current_phase)
+                train_dataloader = build_pretraining_data_loader(
+                    train_ds, consumed_train_samples_in_current_phase
+                )
             valid_dataloaders = []
             for valid_d in valid_ds:
                 if args.skip_train or args.full_validation:
@@ -3698,13 +4118,19 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider
                     if args.multiple_validation_sets:
                         # TODO(bnorick): for multiple validation sets without full validation, args.consumed_valid_samples is not
                         # correct and needs to be calculated/set per validation set
-                        raise NotImplementedError("--multiple-validation-sets currently requires --full-validation")
-                    valid_dataloaders.append(build_pretraining_data_loader(valid_d, args.consumed_valid_samples))
+                        raise NotImplementedError(
+                            "--multiple-validation-sets currently requires --full-validation"
+                        )
+                    valid_dataloaders.append(
+                        build_pretraining_data_loader(valid_d, args.consumed_valid_samples)
+                    )
             if not args.multiple_validation_sets:
                 assert len(valid_dataloaders) == 1
             test_dataloader = build_pretraining_data_loader(test_ds, 0)
             do_train = train_dataloader is not None and (args.skip_train or args.train_iters > 0)
-            do_valid = valid_dataloaders is not None and (args.full_validation or args.eval_iters > 0)
+            do_valid = valid_dataloaders is not None and (
+                args.full_validation or args.eval_iters > 0
+            )
             do_test = test_dataloader is not None and (args.full_validation or args.eval_iters > 0)
 
         flags = torch.tensor(
@@ -3721,14 +4147,14 @@ def build_train_valid_test_data_loaders(build_train_valid_test_datasets_provider
     return train_dataloader, valid_dataloaders, test_dataloader
 
 
-def build_train_valid_test_data_iterators(build_train_valid_test_datasets_provider):
+def build_train_valid_test_data_iterators(build_train_valid_test_datasets_provider, vp_stage=None):
     """Build pretraining data iterators."""
 
     args = get_args()
 
     # Build loaders.
     train_dataloader, valid_dataloaders, test_dataloader = build_train_valid_test_data_loaders(
-        build_train_valid_test_datasets_provider
+        build_train_valid_test_datasets_provider, vp_stage=vp_stage
     )
 
     # Build iterators.
@@ -3762,7 +4188,7 @@ def _get_iterator(dataloader_type, dataloader):
         if args.full_validation:
             if args.multiple_validation_sets:
                 if valid_dataloaders[0] is None:
-                    args.eval_iters = [None]*len(valid_dataloaders)
+                    args.eval_iters = [None] * len(valid_dataloaders)
                 else:
                     args.eval_iters = [len(dl) for dl in valid_dataloaders]
             else:
diff --git a/megatron/training/utils.py b/megatron/training/utils.py
index aaa10c7edcd..6581193a067 100644
--- a/megatron/training/utils.py
+++ b/megatron/training/utils.py
@@ -5,14 +5,14 @@
 import os
 import sys
 import warnings
+from collections import defaultdict
 from contextlib import contextmanager
 from datetime import datetime
-from collections import defaultdict
 
 import torch
 
-from megatron.core.msc_utils import MultiStorageClientFeature, open_file
 from megatron.core._rank_utils import safe_get_rank as _safe_get_rank
+from megatron.core.msc_utils import MultiStorageClientFeature, open_file
 
 try:
     from transformer_engine.pytorch.optimizers import multi_tensor_applier, multi_tensor_l2norm
@@ -32,18 +32,18 @@
             local_multi_tensor_applier as multi_tensor_applier,
         )
 
-from megatron.training import get_args, get_timers, get_adlr_autoresume
 from megatron.core import mpu
 from megatron.core.datasets.utils import get_blend_from_list
 from megatron.core.tensor_parallel import param_is_not_tensor_parallel_duplicate
+from megatron.core.transformer.module import param_is_not_shared
 from megatron.core.utils import (
     get_batch_on_this_cp_rank,
     get_data_parallel_group_if_dtensor,
+    is_torch_min_version,
     to_local_if_dtensor,
     unwrap_model,
 )
-
-from megatron.core.transformer.module import param_is_not_shared
+from megatron.training import get_adlr_autoresume, get_args, get_timers
 
 
 def calc_params_l2_norm(model, force_create_fp32_copy=False):
@@ -148,7 +148,7 @@ def calc_params_l2_norm(model, force_create_fp32_copy=False):
     torch.distributed.all_reduce(
         sharded_norm_2,
         op=torch.distributed.ReduceOp.SUM,
-        group=mpu.get_data_parallel_group(with_context_parallel=True)
+        group=mpu.get_data_parallel_group(with_context_parallel=True),
     )
     norm_2 += sharded_norm_2
 
@@ -209,24 +209,22 @@ def calc_dtensor_params_l2_norm(params):
             norm = torch.zeros((1,), dtype=torch.float32, device='cuda')
         else:
             norm, _ = multi_tensor_applier(
-                multi_tensor_l2norm, dummy_overflow_buf, [local_tensors], False  # no per-parameter norm.
+                multi_tensor_l2norm,
+                dummy_overflow_buf,
+                [local_tensors],
+                False,  # no per-parameter norm.
             )
         norm_2 = norm * norm
         for pg, placement in zip(
-            dtensor_spec.device_mesh.get_all_groups(),
-            dtensor_spec.placements,
+            dtensor_spec.device_mesh.get_all_groups(), dtensor_spec.placements
         ):
             if placement.is_shard():
-                torch.distributed.all_reduce(
-                    norm_2, op=torch.distributed.ReduceOp.SUM, group=pg
-                )
+                torch.distributed.all_reduce(norm_2, op=torch.distributed.ReduceOp.SUM, group=pg)
             elif placement.is_replicate():
                 # Replicated parameters are already summed across all ranks.
                 pass
             else:
-                raise RuntimeError(
-                    f"Unsupported placement {placement} for Megatron FSDP."
-                )
+                raise RuntimeError(f"Unsupported placement {placement} for Megatron FSDP.")
         total_norm_2 += norm_2
 
     return total_norm_2.item() ** 0.5
@@ -286,7 +284,8 @@ def report_memory(name):
     string += f" | max allocated: {torch.cuda.max_memory_allocated() / mega_bytes:.2f}"
     string += f" | reserved: {torch.cuda.memory_reserved() / mega_bytes:.2f}"
     string += f" | max reserved: {torch.cuda.max_memory_reserved() / mega_bytes:.2f}"
-    if args.log_device_memory_used:
+    if args.log_device_memory_used and is_torch_min_version("2.6.0"):
+        # device usage is not supported in torch < 2.6.0
         string += f" | total device memory used: {torch.cuda.device_memory_used() / mega_bytes:.2f}"
     if mpu.get_data_parallel_rank() == 0:
         print("[Rank {}] {}".format(torch.distributed.get_rank(), string), flush=True)
@@ -329,13 +328,15 @@ def check_adlr_autoresume_termination(iteration, model, optimizer, opt_param_sch
         sys.exit(0)
 
 
-def get_ltor_masks_and_position_ids(data,
-                                    eod_token,
-                                    pad_token,
-                                    reset_position_ids,
-                                    reset_attention_mask,
-                                    eod_mask_loss,
-                                    pad_mask_loss):
+def get_ltor_masks_and_position_ids(
+    data,
+    eod_token,
+    pad_token,
+    reset_position_ids,
+    reset_attention_mask,
+    eod_mask_loss,
+    pad_mask_loss,
+):
     """Build masks and position id for left to right model."""
 
     # Extract batch size and sequence length.
@@ -369,7 +370,9 @@ def get_ltor_masks_and_position_ids(data,
         for b in range(micro_batch_size):
 
             # Find indecies where EOD token is.
-            eod_index = position_ids[b, data[b] == eod_token] & position_ids[b, data[b] == pad_token]
+            eod_index = (
+                position_ids[b, data[b] == eod_token] & position_ids[b, data[b] == pad_token]
+            )
             # Detach indecies from positions if going to modify positions.
             if reset_position_ids:
                 eod_index = eod_index.clone()
@@ -443,10 +446,9 @@ def is_first_or_last_pipeline_stage(vp_stage):
     ignore_virtual = True
     if vp_stage is not None:
         ignore_virtual = False
-    return (
-        mpu.is_pipeline_first_stage(ignore_virtual=ignore_virtual, vp_stage=vp_stage)
-        or mpu.is_pipeline_last_stage(ignore_virtual=ignore_virtual, vp_stage=vp_stage)
-    )
+    return mpu.is_pipeline_first_stage(
+        ignore_virtual=ignore_virtual, vp_stage=vp_stage
+    ) or mpu.is_pipeline_last_stage(ignore_virtual=ignore_virtual, vp_stage=vp_stage)
 
 
 def get_device_arch_version():
@@ -547,14 +549,10 @@ def _broadcast(item):
             ),
             'position_ids': data["position_ids"].cuda(non_blocking=True),
             'cu_seqlens': (
-                None
-                if "cu_seqlens" not in data
-                else data["cu_seqlens"].cuda(non_blocking=True)
+                None if "cu_seqlens" not in data else data["cu_seqlens"].cuda(non_blocking=True)
             ),
             'max_seqlen': (
-                None
-                if "max_seqlen" not in data
-                else data["max_seqlen"].cuda(non_blocking=True)
+                None if "max_seqlen" not in data else data["max_seqlen"].cuda(non_blocking=True)
             ),
             'local_cp_size': (
                 None
@@ -566,7 +564,7 @@ def _broadcast(item):
         def _broadcast_cu_seqlens(cu_seqlens):
             dev = torch.cuda.current_device()
             n = 0 if cu_seqlens is None else int(cu_seqlens.numel())
-            n_tensor = torch.tensor(n, dtype=torch.int64, device=dev)
+            n_tensor = torch.empty(1, dtype=torch.int64, device=dev).fill_(n)
             _broadcast(n_tensor)
 
             if n == 0:
@@ -578,10 +576,12 @@ def _broadcast_cu_seqlens(cu_seqlens):
                 buf = cu_seqlens.to(device=dev, non_blocking=True).contiguous()
             _broadcast(buf)
 
-        if args.hybrid_context_parallel:
-            seq_len = torch.tensor(batch['tokens'].shape[0], dtype=torch.int32, device=torch.cuda.current_device())
+        if args.dynamic_context_parallel:
+            seq_len = torch.tensor(
+                batch['tokens'].shape[0], dtype=torch.int32, device=torch.cuda.current_device()
+            )
             _broadcast(seq_len)
-            
+
         if args.pipeline_model_parallel_size == 1 or mtp_on_this_rank:
             _broadcast(batch['tokens'])
             _broadcast(batch['labels'])
@@ -608,57 +608,39 @@ def _broadcast_cu_seqlens(cu_seqlens):
             _broadcast(batch['attention_mask'])
 
     else:
-        if args.hybrid_context_parallel:
+        if args.dynamic_context_parallel:
             seq_len = torch.tensor(0, dtype=torch.int32, device=torch.cuda.current_device())
             _broadcast(seq_len)
-            shape = (seq_len.item())
+            shape = seq_len.item()
         else:
             shape = (args.micro_batch_size, args.seq_length)
-            
-        tokens = torch.empty(
-            shape,
-            dtype=torch.int64,
-            device=torch.cuda.current_device(),
-        )
-        labels = torch.empty(
-            shape,
-            dtype=torch.int64,
-            device=torch.cuda.current_device(),
-        )
-        loss_mask = torch.empty(
-            shape,
-            dtype=torch.float32,
-            device=torch.cuda.current_device(),
-        )
+
+        tokens = torch.empty(shape, dtype=torch.int64, device=torch.cuda.current_device())
+        labels = torch.empty(shape, dtype=torch.int64, device=torch.cuda.current_device())
+        loss_mask = torch.empty(shape, dtype=torch.float32, device=torch.cuda.current_device())
         if args.create_attention_mask_in_dataloader:
-            shape_attention_mask = (args.micro_batch_size, 1, args.seq_length, args.seq_length) if not args.hybrid_context_parallel else (1, 1, shape[0], shape[0])
+            shape_attention_mask = (
+                (args.micro_batch_size, 1, args.seq_length, args.seq_length)
+                if not args.dynamic_context_parallel
+                else (1, 1, shape[0], shape[0])
+            )
             attention_mask = torch.empty(
-                shape_attention_mask,
-                dtype=torch.bool,
-                device=torch.cuda.current_device(),
+                shape_attention_mask, dtype=torch.bool, device=torch.cuda.current_device()
             )
         else:
             attention_mask = None
-        position_ids = torch.empty(
-            shape,
-            dtype=torch.int64,
-            device=torch.cuda.current_device(),
-        )
+        position_ids = torch.empty(shape, dtype=torch.int64, device=torch.cuda.current_device())
         cu_seqlens = None
-        if args.hybrid_context_parallel or args.sft:
-            max_seqlen = torch.empty(
-                1,
-                dtype=torch.int32,
-                device=torch.cuda.current_device(),
-            )
+        if args.dynamic_context_parallel or args.sft:
+            max_seqlen = torch.empty(1, dtype=torch.int32, device=torch.cuda.current_device())
         else:
             max_seqlen = None
-        
-        local_cp_size = torch.empty(
-            1,
-            dtype=torch.int32,
-            device=torch.cuda.current_device(),
-        ) if args.hybrid_context_parallel else None
+
+        local_cp_size = (
+            torch.empty(1, dtype=torch.int32, device=torch.cuda.current_device())
+            if args.dynamic_context_parallel
+            else None
+        )
 
         def _broadcast_cu_seqlens():
             dev = torch.cuda.current_device()
@@ -734,7 +716,7 @@ def to_empty_if_meta_device(module: torch.nn.Module, *, device: torch.device, re
     accidently overwrite buffers with precomputed values during construction. Given the
     goal is to only materialize those tensors on meta device, this function checks the
     device first and only move the tensor to the destination if it is not on meta device.
-   
+
     Args:
         module: The target module to apply this transformation.
         device: The desired device of the parameters
@@ -749,9 +731,7 @@ def _empty_like_if_meta(tensor: torch.Tensor, *, device: torch.device):
         else:
             return tensor.to(device)
 
-    return module._apply(
-        lambda t: _empty_like_if_meta(t, device=device), recurse=recurse
-    )
+    return module._apply(lambda t: _empty_like_if_meta(t, device=device), recurse=recurse)
 
 
 def get_nvtx_range():
@@ -788,6 +768,7 @@ def has_nvrx_installed():
     """Checks if nvidia-resiliency-ext is installed."""
     try:
         import nvidia_resiliency_ext
+
         return True
     except (ImportError, ModuleNotFoundError):
         return False
diff --git a/megatron/training/yaml_arguments.py b/megatron/training/yaml_arguments.py
index 3a2d04aadcf..8e4de13df4d 100644
--- a/megatron/training/yaml_arguments.py
+++ b/megatron/training/yaml_arguments.py
@@ -7,39 +7,43 @@
 import json
 import os
 import re
-import torch
 import types
-import yaml
-
 from itertools import chain, starmap
 from types import SimpleNamespace
 
+import torch
 import torch.nn.functional as F
+import yaml
 
-from megatron.core.transformer import TransformerConfig, MLATransformerConfig
+from megatron.core.transformer import MLATransformerConfig, TransformerConfig
 from megatron.core.utils import get_torch_version, is_torch_min_version
 
 # Taken from https://stackoverflow.com/questions/65414773/parse-environment-variable-from-yaml-with-pyyaml
 # Allows for yaml to use environment variables
 env_pattern = re.compile(r".*?\${(.*?)}.*?")
+
+
 def env_constructor(loader, node):
     value = loader.construct_scalar(node)
     for group in env_pattern.findall(value):
         assert os.environ.get(group) is not None, f"environment variable {group} in yaml not found"
         value = value.replace(f"${{{group}}}", os.environ.get(group))
     return value
+
+
 yaml.add_implicit_resolver("!pathex", env_pattern)
 yaml.add_constructor("!pathex", env_constructor)
 
 
 str_dtype_to_torch = {
-    "float32" : torch.float32,
-    "float16" : torch.float16,
-    "bfloat16" : torch.bfloat16
+    "float32": torch.float32,
+    "float16": torch.float16,
+    "bfloat16": torch.bfloat16,
 }
 
+
 def validate_yaml(args, defaults={}):
-    
+
     # This is for legacy script env var setting
     if type(args.data_path) is str:
         # If no white space its a single path
@@ -49,42 +53,63 @@ def validate_yaml(args, defaults={}):
 
     # Tensor model parallel size.
     args.model_parallel.tensor_model_parallel_size = min(
-        args.model_parallel.tensor_model_parallel_size, args.world_size)
-    assert args.world_size % args.model_parallel.tensor_model_parallel_size == 0, 'world size'\
-        ' ({}) is not divisible by tensor model parallel size ({})'.format(
-            args.world_size, args.model_parallel.tensor_model_parallel_size)
+        args.model_parallel.tensor_model_parallel_size, args.world_size
+    )
+    assert (
+        args.world_size % args.model_parallel.tensor_model_parallel_size == 0
+    ), 'world size' ' ({}) is not divisible by tensor model parallel size ({})'.format(
+        args.world_size, args.model_parallel.tensor_model_parallel_size
+    )
     # Pipeline model parallel size.
     args.model_parallel.pipeline_model_parallel_size = min(
         args.model_parallel.pipeline_model_parallel_size,
-        (args.world_size // args.model_parallel.tensor_model_parallel_size))
+        (args.world_size // args.model_parallel.tensor_model_parallel_size),
+    )
     args.model_parallel.transformer_pipeline_model_parallel_size = (
         args.model_parallel.pipeline_model_parallel_size - 1
-        if args.account_for_embedding_in_pipeline_split else
-        args.model_parallel.pipeline_model_parallel_size
+        if args.account_for_embedding_in_pipeline_split
+        else args.model_parallel.pipeline_model_parallel_size
     )
     # Checks.
-    model_parallel_size = args.model_parallel.pipeline_model_parallel_size * \
-                          args.model_parallel.tensor_model_parallel_size
-    assert args.world_size % (model_parallel_size * args.model_parallel.context_parallel_size) == 0, \
-        'world size ({}) is not divisible by tensor parallel size ({}) times ' \
+    model_parallel_size = (
+        args.model_parallel.pipeline_model_parallel_size
+        * args.model_parallel.tensor_model_parallel_size
+    )
+    assert (
+        args.world_size % (model_parallel_size * args.model_parallel.context_parallel_size) == 0
+    ), (
+        'world size ({}) is not divisible by tensor parallel size ({}) times '
         'pipeline parallel size ({}) times context parallel size ({})'.format(
-        args.world_size, args.model_parallel.tensor_model_parallel_size,
-        args.model_parallel.pipeline_model_parallel_size, args.model_parallel.context_parallel_size)
-    
+            args.world_size,
+            args.model_parallel.tensor_model_parallel_size,
+            args.model_parallel.pipeline_model_parallel_size,
+            args.model_parallel.context_parallel_size,
+        )
+    )
+
     # data_parallel_size is not in model parallel config
-    args.data_parallel_size = args.world_size // (model_parallel_size * args.model_parallel.context_parallel_size)
+    args.data_parallel_size = args.world_size // (
+        model_parallel_size * args.model_parallel.context_parallel_size
+    )
     if args.rank == 0:
-        print('using world size: {}, data-parallel size: {}, '
-              'context-parallel size: {}, '
-              'tensor-model-parallel size: {}, '
-              'pipeline-model-parallel size: {}'.format(
-                  args.world_size, args.data_parallel_size,
-                  args.model_parallel.context_parallel_size,
-                  args.model_parallel.tensor_model_parallel_size,
-                  args.model_parallel.pipeline_model_parallel_size), flush=True)
+        print(
+            'using world size: {}, data-parallel size: {}, '
+            'context-parallel size: {}, '
+            'tensor-model-parallel size: {}, '
+            'pipeline-model-parallel size: {}'.format(
+                args.world_size,
+                args.data_parallel_size,
+                args.model_parallel.context_parallel_size,
+                args.model_parallel.tensor_model_parallel_size,
+                args.model_parallel.pipeline_model_parallel_size,
+            ),
+            flush=True,
+        )
 
     if args.model_parallel.tp_comm_overlap:
-        assert args.model_parallel.sequence_parallel == True, 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled'
+        assert (
+            args.model_parallel.sequence_parallel == True
+        ), 'Tensor parallel communication/GEMM overlap can happen only when sequence parallelism is enabled'
 
     # Set input defaults.
     for key in defaults:
@@ -93,10 +118,13 @@ def validate_yaml(args, defaults={}):
         # ensuring the arg is set to None.
         if getattr(args, key, None) is not None:
             if args.rank == 0:
-                print('WARNING: overriding default arguments for {key}:{v} \
-                       with {key}:{v2}'.format(key=key, v=defaults[key],
-                                               v2=getattr(args, key)),
-                                               flush=True)
+                print(
+                    'WARNING: overriding default arguments for {key}:{v} \
+                       with {key}:{v2}'.format(
+                        key=key, v=defaults[key], v2=getattr(args, key)
+                    ),
+                    flush=True,
+                )
         else:
             setattr(args, key, defaults[key])
 
@@ -107,14 +135,11 @@ def validate_yaml(args, defaults={}):
         args, '_is_global_batch_size_explicitly_specified', args.global_batch_size is not None
     )
     if args.step_batch_size_schedule is not None and is_global_batch_size_explicitly_specified:
-        raise ValueError(
-            'Cannot specify both --step-batch-size-schedule and --global-batch-size'
-        )
+        raise ValueError('Cannot specify both --step-batch-size-schedule and --global-batch-size')
     if args.global_batch_size is None:
         args.global_batch_size = args.micro_batch_size * args.data_parallel_size
         if args.rank == 0:
-            print('setting global batch size to {}'.format(
-                args.global_batch_size), flush=True)
+            print('setting global batch size to {}'.format(args.global_batch_size), flush=True)
     assert args.global_batch_size > 0
 
     # Eval batch size.
@@ -122,35 +147,50 @@ def validate_yaml(args, defaults={}):
         args.eval_global_batch_size = args.global_batch_size
     if getattr(args, 'eval_micro_batch_size', None) is None:
         args.eval_micro_batch_size = args.micro_batch_size
-    assert args.eval_global_batch_size % (args.eval_micro_batch_size * args.data_parallel_size) == 0, \
-        f"eval_global_batch_size ({args.eval_global_batch_size}) must be divisible by " \
+    assert (
+        args.eval_global_batch_size % (args.eval_micro_batch_size * args.data_parallel_size) == 0
+    ), (
+        f"eval_global_batch_size ({args.eval_global_batch_size}) must be divisible by "
         f"eval_micro_batch_size ({args.eval_micro_batch_size}) * data_parallel_size ({args.data_parallel_size})"
+    )
 
     # num_layers_per_virtual_pipeline_stage is not insde model parallel for checkpointing
     if args.num_layers_per_virtual_pipeline_stage is not None:
-        assert args.model_parallel.pipeline_model_parallel_size > 2, \
-            'pipeline-model-parallel size should be greater than 2 with ' \
-            'interleaved schedule'
-        assert args.language_model.num_layers % args.model_parallel.transformer_pipeline_model_parallel_size == 0, \
-            'number of layers should be divisible by the pipeline parallel size'
-        num_layers_per_pipeline_stage = args.language_model.num_layers // args.model_parallel.transformer_pipeline_model_parallel_size
-        assert num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0, \
-            'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage'
-        args.model_parallel.virtual_pipeline_model_parallel_size = num_layers_per_pipeline_stage // \
-            args.num_layers_per_virtual_pipeline_stage
+        assert args.model_parallel.pipeline_model_parallel_size > 2, (
+            'pipeline-model-parallel size should be greater than 2 with ' 'interleaved schedule'
+        )
+        assert (
+            args.language_model.num_layers
+            % args.model_parallel.transformer_pipeline_model_parallel_size
+            == 0
+        ), 'number of layers should be divisible by the pipeline parallel size'
+        num_layers_per_pipeline_stage = (
+            args.language_model.num_layers
+            // args.model_parallel.transformer_pipeline_model_parallel_size
+        )
+        assert (
+            num_layers_per_pipeline_stage % args.num_layers_per_virtual_pipeline_stage == 0
+        ), 'number of layers per pipeline stage must be divisible number of layers per virtual pipeline stage'
+        args.model_parallel.virtual_pipeline_model_parallel_size = (
+            num_layers_per_pipeline_stage // args.num_layers_per_virtual_pipeline_stage
+        )
     else:
         args.model_parallel.virtual_pipeline_model_parallel_size = None
         # Overlap P2P communication is disabled if not using the interleaved schedule.
         args.model_parallel.overlap_p2p_comm = False
         if args.rank == 0:
-            print('WARNING: Setting args.overlap_p2p_comm to False since non-interleaved '
-                  'schedule does not support overlapping p2p communication')
+            print(
+                'WARNING: Setting args.overlap_p2p_comm to False since non-interleaved '
+                'schedule does not support overlapping p2p communication'
+            )
 
     if args.overlap_param_gather:
-        assert args.use_distributed_optimizer, \
-            '--overlap-param-gather only supported with distributed optimizer'
-        assert args.overlap_grad_reduce, \
-            '--overlap-grad-reduce should be turned on when using --overlap-param-gather'
+        assert (
+            args.use_distributed_optimizer
+        ), '--overlap-param-gather only supported with distributed optimizer'
+        assert (
+            args.overlap_grad_reduce
+        ), '--overlap-grad-reduce should be turned on when using --overlap-param-gather'
 
     # Parameters dtype.
     if args.model_parallel.fp16:
@@ -164,12 +204,13 @@ def validate_yaml(args, defaults={}):
         if not args.accumulate_allreduce_grads_in_fp32:
             args.accumulate_allreduce_grads_in_fp32 = True
             if args.rank == 0:
-                print('accumulate and all-reduce gradients in fp32 for '
-                      'bfloat16 data type.', flush=True)
+                print(
+                    'accumulate and all-reduce gradients in fp32 for ' 'bfloat16 data type.',
+                    flush=True,
+                )
 
     if args.rank == 0:
-        print('using {} for parameters ...'.format(args.model_parallel.params_dtype),
-              flush=True)
+        print('using {} for parameters ...'.format(args.model_parallel.params_dtype), flush=True)
 
     if args.dataloader_type is None:
         args.dataloader_type = 'single'
@@ -189,39 +230,36 @@ def validate_yaml(args, defaults={}):
     if args.train_iters:
         # If we use iteration-based training, make sure the
         # sample-based options are off.
-        assert args.train_samples is None, \
-            'expected iteration-based training'
-        assert args.lr_decay_samples is None, \
-            'expected iteration-based learning rate decay'
-        assert args.lr_warmup_samples == 0, \
-            'expected iteration-based learning rate warmup'
+        assert args.train_samples is None, 'expected iteration-based training'
+        assert args.lr_decay_samples is None, 'expected iteration-based learning rate decay'
+        assert args.lr_warmup_samples == 0, 'expected iteration-based learning rate warmup'
         if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_iters == 0, \
-                'can only specify one of lr-warmup-fraction and lr-warmup-iters'
+            assert (
+                args.lr_warmup_iters == 0
+            ), 'can only specify one of lr-warmup-fraction and lr-warmup-iters'
 
     # Sample-based training.
     if args.train_samples:
         # If we use sample-based training, make sure the
         # iteration-based options are off.
-        assert args.train_iters is None, \
-            'expected sample-based training'
-        assert args.lr_decay_iters is None, \
-            'expected sample-based learning rate decay'
-        assert args.lr_warmup_iters == 0, \
-            'expected sample-based learnig rate warmup'
+        assert args.train_iters is None, 'expected sample-based training'
+        assert args.lr_decay_iters is None, 'expected sample-based learning rate decay'
+        assert args.lr_warmup_iters == 0, 'expected sample-based learnig rate warmup'
         if args.lr_warmup_fraction is not None:
-            assert args.lr_warmup_samples == 0, \
-                'can only specify one of lr-warmup-fraction ' \
-                'and lr-warmup-samples'
+            assert args.lr_warmup_samples == 0, (
+                'can only specify one of lr-warmup-fraction ' 'and lr-warmup-samples'
+            )
 
     # How to handle this better
     if args.language_model.num_layers is not None:
-        assert args.encoder_num_layers is None, \
-            'cannot have both num-layers and encoder-num-layers specified'
+        assert (
+            args.encoder_num_layers is None
+        ), 'cannot have both num-layers and encoder-num-layers specified'
         args.encoder_num_layers = args.language_model.num_layers
     else:
-        assert args.encoder_num_layers is not None, \
-            'either num-layers or encoder-num-layers should be specified'
+        assert (
+            args.encoder_num_layers is not None
+        ), 'either num-layers or encoder-num-layers should be specified'
         args.language_model.num_layers = args.encoder_num_layers
 
     # Check required arguments.
@@ -238,15 +276,19 @@ def validate_yaml(args, defaults={}):
             # the same ballpark as the counterpart with 4*h size
             # we keep it a multiple of 64, which means the actual tensor size
             # will be a multiple of 64 / tp_size
-            args.language_model.ffn_hidden_size = int((4 * args.language_model.hidden_size * 2 / 3) / 64) * 64
+            args.language_model.ffn_hidden_size = (
+                int((4 * args.language_model.hidden_size * 2 / 3) / 64) * 64
+            )
         else:
             args.language_model.ffn_hidden_size = 4 * args.language_model.hidden_size
 
     if args.language_model.kv_channels is None:
         assert args.language_model.hidden_size % args.language_model.num_attention_heads == 0
-        args.language_model.kv_channels = args.language_model.hidden_size // args.language_model.num_attention_heads
+        args.language_model.kv_channels = (
+            args.language_model.hidden_size // args.language_model.num_attention_heads
+        )
 
-    #TODO: Implement arguments for encoder-decoder
+    # TODO: Implement arguments for encoder-decoder
     if args.seq_length is not None:
         assert args.encoder_seq_length is None
         args.encoder_seq_length = args.seq_length
@@ -266,8 +308,9 @@ def validate_yaml(args, defaults={}):
     if args.fp16_lm_cross_entropy:
         assert args.fp16, 'lm cross entropy in fp16 only support in fp16 mode.'
     if args.language_model.fp32_residual_connection:
-        assert args.model_parallel.fp16 or args.model_parallel.bf16, \
-            'residual connection in fp32 only supported when using fp16 or bf16.'
+        assert (
+            args.model_parallel.fp16 or args.model_parallel.bf16
+        ), 'residual connection in fp32 only supported when using fp16 or bf16.'
 
     if args.language_model.moe_grouped_gemm:
         assert args.model_parallel.bf16, 'Currently GroupedGEMM for MoE only supports bf16 dtype.'
@@ -287,30 +330,33 @@ def validate_yaml(args, defaults={}):
     if not is_torch_min_version("1.11.0a0"):
         args.language_model.persist_layer_norm = False
         if args.rank == 0:
-            print('Persistent fused layer norm kernel is supported from '
-                  'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
-                  'Defaulting to no_persist_layer_norm=True')
+            print(
+                'Persistent fused layer norm kernel is supported from '
+                'pytorch v1.11 (nvidia pytorch container paired with v1.11). '
+                'Defaulting to no_persist_layer_norm=True'
+            )
 
     # Activation recomputing.
     if args.language_model.distribute_saved_activations:
-        assert args.model_parallel.tensor_model_parallel_size > 1, 'can distribute ' \
-            'recomputed activations only across tensor model ' \
-            'parallel groups'
-        assert args.language_model.recompute_granularity == 'full', \
-            'distributed recompute activations is only '\
-            'application to full recompute granularity'
-        assert args.language_model.recompute_method is not None, \
-            'for distributed recompute activations to work you '\
-            'need to use a recompute method '
-        assert is_torch_min_version("1.10.0a0"), \
-            'distributed recompute activations are supported for pytorch ' \
-            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current ' \
+        assert args.model_parallel.tensor_model_parallel_size > 1, (
+            'can distribute ' 'recomputed activations only across tensor model ' 'parallel groups'
+        )
+        assert args.language_model.recompute_granularity == 'full', (
+            'distributed recompute activations is only ' 'application to full recompute granularity'
+        )
+        assert args.language_model.recompute_method is not None, (
+            'for distributed recompute activations to work you ' 'need to use a recompute method '
+        )
+        assert is_torch_min_version("1.10.0a0"), (
+            'distributed recompute activations are supported for pytorch '
+            'v1.10 and above (Nvidia Pytorch container >= 21.07). Current '
             f'pytorch version is v{get_torch_version()}.'
+        )
 
     if args.language_model.recompute_granularity == 'selective':
-        assert args.language_model.recompute_method is None, \
-            'recompute method is not yet supported for ' \
-            'selective recomputing granularity'
+        assert args.language_model.recompute_method is None, (
+            'recompute method is not yet supported for ' 'selective recomputing granularity'
+        )
 
     # disable sequence parallelism when tp=1
     # to avoid change in numerics when
@@ -322,27 +368,34 @@ def validate_yaml(args, defaults={}):
         if args.model_parallel.sequence_parallel:
             raise RuntimeError(
                 "Using sequence parallelism requires setting the environment variable "
-                "CUDA_DEVICE_MAX_CONNECTIONS to 1")
-    
+                "CUDA_DEVICE_MAX_CONNECTIONS to 1"
+            )
+
     # MoE Spec check
     if args.language_model.num_moe_experts is not None:
         assert args.spec is None, "Model Spec must be None when using MoEs"
         if args.model_parallel.tensor_model_parallel_size > 1:
-            assert args.model_parallel.sequence_parallel, \
-                "When using MoE and tensor parallelism, sequence parallelism must be used."
+            assert (
+                args.model_parallel.sequence_parallel
+            ), "When using MoE and tensor parallelism, sequence parallelism must be used."
 
     # Expert parallelism check
-    if args.model_parallel.expert_model_parallel_size  > 1:
-        assert args.language_model.num_moe_experts is not None, "num_experts must be non None to use expert model parallelism"
-        assert args.language_model.num_moe_experts % args.model_parallel.expert_model_parallel_size == 0, \
-            "Number of experts should be a multiple of expert model parallel_size."
-        assert not args.model_parallel.fp16, \
-            "Expert parallelism is not supported with fp16 training."
+    if args.model_parallel.expert_model_parallel_size > 1:
+        assert (
+            args.language_model.num_moe_experts is not None
+        ), "num_experts must be non None to use expert model parallelism"
+        assert (
+            args.language_model.num_moe_experts % args.model_parallel.expert_model_parallel_size
+            == 0
+        ), "Number of experts should be a multiple of expert model parallel_size."
+        assert (
+            not args.model_parallel.fp16
+        ), "Expert parallelism is not supported with fp16 training."
 
     # Print arguments.
     _print_args("arguments", args)
 
-    #TODO: Added as much of the global initialization requires the model parallel arguments
+    # TODO: Added as much of the global initialization requires the model parallel arguments
     args = SimpleNamespace(**args.__dict__, **args.model_parallel.__dict__)
     args = SimpleNamespace(**args.__dict__, **args.language_model.__dict__)
     # For GPT Layer spec in pretrain_gpt
@@ -350,19 +403,19 @@ def validate_yaml(args, defaults={}):
 
     return args
 
+
 def _print_args(title, args):
     """Print arguments."""
     if args.rank == 0:
-        print(f'------------------------ {title} ------------------------',
-              flush=True)
+        print(f'------------------------ {title} ------------------------', flush=True)
         str_list = []
         for arg in vars(args):
             dots = '.' * (48 - len(arg))
             str_list.append('  {} {} {}'.format(arg, dots, getattr(args, arg)))
         for arg in sorted(str_list, key=lambda x: x.lower()):
             print(arg, flush=True)
-        print(f'-------------------- end of {title} ---------------------',
-              flush=True)
+        print(f'-------------------- end of {title} ---------------------', flush=True)
+
 
 def core_config_from_args(args, dataclass=TransformerConfig):
     """Builds core config object from namespace args from given dataclass
@@ -370,7 +423,7 @@ def core_config_from_args(args, dataclass=TransformerConfig):
     Raises exception if argument missing in args
 
     Args:
-        args(SimpleNamespace, optional): Namespace to pull argument values from 
+        args(SimpleNamespace, optional): Namespace to pull argument values from
         dataclass (dataclass, optional): Core dataclass config to pull argument names from
 
 
@@ -385,28 +438,36 @@ def core_config_from_args(args, dataclass=TransformerConfig):
             raise Exception(f"Missing argument {f.name} for {str(dataclass)} config")
     return kw_args
 
+
 def _check_arg_is_not_none(args, arg):
     assert getattr(args, arg) is not None, '{} argument is None'.format(arg)
 
-def core_transformer_config_from_yaml(args, transfomer_key = "language_model"):    
+
+def core_transformer_config_from_yaml(args, transfomer_key="language_model"):
     # Combine transfomer config with model parallel args
     args = SimpleNamespace(**vars(getattr(args, transfomer_key)), **vars(args.model_parallel))
     # Translate args to core transformer configuration
-    kw_args = core_config_from_args(args, TransformerConfig)    
-    
-    # Hardcoded 
+    kw_args = core_config_from_args(args, TransformerConfig)
+
+    # Hardcoded
     kw_args['deallocate_pipeline_outputs'] = True
     kw_args['pipeline_dtype'] = kw_args['params_dtype']
-    kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm 
-    
-    assert args.activation_func in ["swiglu","squaredrelu","gelu"], f"{args.activation_func} is not a supported activation function"
+    kw_args['batch_p2p_comm'] = not args.overlap_p2p_comm
+
+    assert args.activation_func in [
+        "swiglu",
+        "squaredrelu",
+        "gelu",
+    ], f"{args.activation_func} is not a supported activation function"
     if args.activation_func == "swiglu":
         kw_args['activation_func'] = F.silu
         kw_args['gated_linear_unit'] = True
         kw_args['bias_activation_fusion'] = args.bias_swiglu_fusion
     elif args.activation_func == "squaredrelu":
+
         def squared_relu(x):
             return torch.pow(F.relu(x), 2)
+
         kw_args['activation_func'] = squared_relu
     elif args.activation_func == "gelu":
         kw_args['activation_func'] = F.gelu
@@ -414,29 +475,31 @@ def squared_relu(x):
             kw_args['bias_activation_fusion'] = False
         else:
             kw_args['bias_activation_fusion'] = args.bias_activation_fusion
-    
+
     if args.init_method == "xavier_uniform":
         kw_args['init_method'] = torch.nn.init.xavier_uniform_
         kw_args['scaled_init_method'] = torch.nn.init.xavier_uniform_
     if args.embedding_init_method == "xavier_uniform":
         kw_args['embedding_init_method'] = torch.nn.init.xavier_uniform_
-    
+
     # Return Transformer config.
     if getattr(args, "multi_latent_attention", False):
         return MLATransformerConfig(**kw_args)
     else:
         return TransformerConfig(**kw_args)
 
+
 def load_yaml(yaml_path):
     print(f"warning using experimental yaml arguments feature, argparse arguments will be ignored")
     with open(yaml_path, "r") as f:
         config = yaml.safe_load(f)
         # Convert to nested namespace
-        config_namespace = json.loads(json.dumps(config), object_hook=lambda item: SimpleNamespace(**item))
+        config_namespace = json.loads(
+            json.dumps(config), object_hook=lambda item: SimpleNamespace(**item)
+        )
         # Add config location to namespace
         config_namespace.yaml_cfg = yaml_path
         config_namespace._is_global_batch_size_explicitly_specified = (
             getattr(config_namespace, "global_batch_size", None) is not None
         )
         return config_namespace
-
diff --git a/pretrain_bert.py b/pretrain_bert.py
index a5a75d87879..65b267c46e1 100644
--- a/pretrain_bert.py
+++ b/pretrain_bert.py
@@ -7,25 +7,29 @@
 import torch
 import torch.nn.functional as F
 
-from megatron.training import get_args
-from megatron.training import print_rank_0
-from megatron.training import get_timers
-from megatron.core import tensor_parallel
+from megatron.core import mpu, tensor_parallel
+from megatron.core.datasets.bert_dataset import (
+    BERTMaskedWordPieceDataset,
+    BERTMaskedWordPieceDatasetConfig,
+)
+from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
+from megatron.core.datasets.utils import get_blend_from_list
 from megatron.core.enums import ModelType
+from megatron.core.models.bert.bert_layer_specs import (
+    bert_layer_local_spec,
+    bert_layer_with_transformer_engine_spec,
+)
 from megatron.core.models.bert.bert_model import BertModel
-from megatron.training import pretrain
-from megatron.training.utils import average_losses_across_data_parallel_group
-from megatron.training.arguments import core_transformer_config_from_args, parse_and_validate_args
-from megatron.core.transformer.spec_utils import import_module
-from megatron.core.models.bert.bert_layer_specs import bert_layer_with_transformer_engine_spec, bert_layer_local_spec
 from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
-from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
-from megatron.core.datasets.bert_dataset import BERTMaskedWordPieceDataset, BERTMaskedWordPieceDatasetConfig
-from megatron.core.datasets.utils import get_blend_from_list
-from megatron.core import mpu, tensor_parallel
+from megatron.core.transformer.spec_utils import import_module
+from megatron.training import get_args, get_timers, pretrain, print_rank_0
+from megatron.training.arguments import core_transformer_config_from_args, parse_and_validate_args
+from megatron.training.utils import average_losses_across_data_parallel_group
 
 
-def model_provider(pre_process=True, post_process=True, vp_stage=None, config=None, pg_collection=None):
+def model_provider(
+    pre_process=True, post_process=True, vp_stage=None, config=None, pg_collection=None
+):
     """Build the model."""
 
     print_rank_0('building BERT model ...')
@@ -36,11 +40,11 @@ def model_provider(pre_process=True, post_process=True, vp_stage=None, config=No
     num_tokentypes = 2 if args.bert_binary_head else 0
 
     if args.spec is None:
-        transformer_layer_spec = bert_layer_with_transformer_engine_spec #default spec
+        transformer_layer_spec = bert_layer_with_transformer_engine_spec  # default spec
     elif args.spec[0] == 'local':
         print_rank_0('Using Local spec for transformer layers')
         transformer_layer_spec = bert_layer_local_spec
-    else :
+    else:
         transformer_layer_spec = import_module(args.spec)
 
     model = BertModel(
@@ -54,7 +58,8 @@ def model_provider(pre_process=True, post_process=True, vp_stage=None, config=No
         parallel_output=True,
         pre_process=pre_process,
         post_process=post_process,
-        vp_stage=vp_stage)
+        vp_stage=vp_stage,
+    )
 
     return model
 
@@ -63,8 +68,7 @@ def get_batch(data_iterator):
     """Build the batch."""
 
     # Items and their type.
-    keys = ['text', 'types', 'labels',
-            'is_random', 'loss_mask', 'padding_mask']
+    keys = ['text', 'types', 'labels', 'is_random', 'loss_mask', 'padding_mask']
     datatype = torch.int64
 
     # Broadcast data.
@@ -90,23 +94,19 @@ def loss_func(loss_mask, sentence_order, output_tensor):
 
     lm_loss_ = lm_loss_.float()
     loss_mask = loss_mask.float()
-    lm_loss = torch.sum(
-        lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
+    lm_loss = torch.sum(lm_loss_.view(-1) * loss_mask.reshape(-1)) / loss_mask.sum()
 
     if sop_logits is not None:
-        sop_loss = F.cross_entropy(sop_logits.view(-1, 2).float(),
-                                   sentence_order.view(-1),
-                                   ignore_index=-1)
+        sop_loss = F.cross_entropy(
+            sop_logits.view(-1, 2).float(), sentence_order.view(-1), ignore_index=-1
+        )
         sop_loss = sop_loss.float()
         loss = lm_loss + sop_loss
-        averaged_losses = average_losses_across_data_parallel_group(
-            [lm_loss, sop_loss])
-        return loss, {'lm loss': averaged_losses[0],
-                      'sop loss': averaged_losses[1]}
+        averaged_losses = average_losses_across_data_parallel_group([lm_loss, sop_loss])
+        return loss, {'lm loss': averaged_losses[0], 'sop loss': averaged_losses[1]}
     else:
         loss = lm_loss
-        averaged_losses = average_losses_across_data_parallel_group(
-            [lm_loss])
+        averaged_losses = average_losses_across_data_parallel_group([lm_loss])
         return loss, {'lm loss': averaged_losses[0]}
 
 
@@ -117,16 +117,14 @@ def forward_step(data_iterator, model):
 
     # Get the batch.
     timers('batch-generator', log_level=2).start()
-    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = get_batch(
-        data_iterator)
+    tokens, types, sentence_order, loss_mask, lm_labels, padding_mask = get_batch(data_iterator)
     timers('batch-generator').stop()
 
     if not args.bert_binary_head:
         types = None
 
     # Forward pass through the model.
-    output_tensor = model(tokens, padding_mask,
-                          tokentype_ids=types, lm_labels=lm_labels)
+    output_tensor = model(tokens, padding_mask, tokentype_ids=types, lm_labels=lm_labels)
 
     return output_tensor, partial(loss_func, loss_mask, sentence_order)
 
@@ -144,7 +142,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
         blend_per_split=[
             get_blend_from_list(args.train_data_path),
             get_blend_from_list(args.valid_data_path),
-            get_blend_from_list(args.test_data_path)
+            get_blend_from_list(args.test_data_path),
         ],
         split=args.split,
         path_to_cache=args.data_cache_path,
@@ -161,8 +159,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
         allow_ambiguous_pad_tokens=args.allow_ambiguous_pad_tokens,
     )
 
-    print_rank_0('> building train, validation, and test datasets '
-                 'for BERT ...')
+    print_rank_0('> building train, validation, and test datasets ' 'for BERT ...')
 
     train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
         BERTMaskedWordPieceDataset,
@@ -182,6 +179,9 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
     train_valid_test_datasets_provider.is_distributed = True
 
     parse_and_validate_args(args_defaults={'tokenizer_type': 'BertWordPieceLowerCase'})
-    pretrain(train_valid_test_datasets_provider, model_provider,
-             ModelType.encoder_or_decoder,
-             forward_step)
+    pretrain(
+        train_valid_test_datasets_provider,
+        model_provider,
+        ModelType.encoder_or_decoder,
+        forward_step,
+    )
diff --git a/pretrain_gpt.py b/pretrain_gpt.py
index 929a9d0f866..6850dd91c2b 100644
--- a/pretrain_gpt.py
+++ b/pretrain_gpt.py
@@ -4,6 +4,7 @@
 
 # Capture the true program start time BEFORE any heavy imports.
 import time
+
 _PROGRAM_START_TIME = time.time()
 
 import json
@@ -11,6 +12,7 @@
 # Suppress warnings on all ranks but rank 0.
 import os
 import warnings
+
 rank = int(os.environ.get('RANK', 0))
 if rank != 0:
     warnings.filterwarnings("ignore", category=UserWarning)
@@ -24,13 +26,19 @@
 from gpt_builders import gpt_builder
 from megatron.core import parallel_state
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
+from megatron.core.datasets.data_schedule import get_batch_on_this_rank_for_sequence_packing
 from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
 from megatron.core.enums import ModelType
-from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.models.gpt import GPTModel
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
-from megatron.core.utils import get_attr_wrapped_model, get_thd_batch_on_this_cp_rank, get_batch_on_this_hybrid_cp_rank, StragglerDetector
+from megatron.core.transformer.multi_token_prediction import get_mtp_ranks, mtp_on_this_rank
+from megatron.core.utils import (
+    StragglerDetector,
+    get_attr_wrapped_model,
+    get_thd_batch_on_this_cp_rank,
+)
 from megatron.training import (
     get_args,
     get_timers,
@@ -39,10 +47,9 @@
     print_rank_0,
     set_startup_timestamps,
 )
-from megatron.training.datasets.sft_dataset import SFTDataset
-from megatron.core.transformer.multi_token_prediction import mtp_on_this_rank, get_mtp_ranks
 from megatron.training.arguments import core_transformer_config_from_args, parse_and_validate_args
 from megatron.training.datasets.fim_dataset import GPTFIMDataset, GPTFIMDatasetConfig
+from megatron.training.datasets.sft_dataset import MockSFTDataset, SFTDataset
 from megatron.training.utils import (
     get_batch_on_this_cp_rank,
     get_batch_on_this_tp_rank,
@@ -114,24 +121,35 @@ def get_batch(data_iterator, vp_stage: Optional[int] = None):
     """
     args = get_args()
     config = core_transformer_config_from_args(args)
+
+    if args.sequence_packing_scheduler is not None:
+        return get_batch_on_this_rank_for_sequence_packing(
+            data_iterator,
+            vpp_size=config.virtual_pipeline_model_parallel_size,
+            mtp_on_this_rank=mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage),
+            vp_stage=vp_stage,
+            dynamic_cp=args.dynamic_context_parallel,
+        )
+
     # TODO: this is pretty hacky, find a better way
     is_packed_sequence = get_args().sft  # SFT always uses packed sequence
-    if not is_first_or_last_pipeline_stage(vp_stage) and not is_packed_sequence and (
-    (not mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage))):
+    if (
+        not is_first_or_last_pipeline_stage(vp_stage)
+        and not is_packed_sequence
+        and ((not mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage)))
+    ):
         return None, None, None, None, None, None
 
     # get batches based on the TP rank you are on
     batch = get_batch_on_this_tp_rank(
         data_iterator,
-        mtp_on_this_rank=mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage)
-        )
+        mtp_on_this_rank=mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage),
+    )
 
     cu_seqlens = batch.pop('cu_seqlens', None)
     cu_seqlens_padded = batch.pop('cu_seqlens_padded', None)
     max_seqlen = batch.pop('max_seqlen', None)
-    local_cp_size = batch.pop('local_cp_size', None)
-    if local_cp_size is not None:
-        local_cp_size = int(local_cp_size.item())
+    batch.pop('local_cp_size', None)
 
     if cu_seqlens is not None:
         assert (
@@ -143,22 +161,29 @@ def get_batch(data_iterator, vp_stage: Optional[int] = None):
     # For middle pipeline stages with packed sequences, only cu_seqlens and
     # max_seqlen are needed (for attention masking); skip the full batch.
     if not is_first_or_last_pipeline_stage(vp_stage) and is_packed_sequence:
-        return None, None, None, None, None, PackedSeqParams(
-            cu_seqlens_q=cu_seqlens,
-            cu_seqlens_kv=cu_seqlens,
-            max_seqlen_q=int(max_seqlen[0].item()),
-            max_seqlen_kv=int(max_seqlen[0].item()),
-            qkv_format='thd',
+        return (
+            None,
+            None,
+            None,
+            None,
+            None,
+            PackedSeqParams(
+                cu_seqlens_q=cu_seqlens,
+                cu_seqlens_kv=cu_seqlens,
+                max_seqlen_q=int(max_seqlen[0].item()),
+                max_seqlen_kv=int(max_seqlen[0].item()),
+                qkv_format='thd',
+            ),
         )
 
-    if cu_seqlens is None and local_cp_size is None:
+    if cu_seqlens is None:
         # slice batch along sequence dimension for context parallelism
         batch = get_batch_on_this_cp_rank(batch)  # The implementation of this function is in MCore
         packed_seq_params = None
-    elif local_cp_size is None:  # Packed THD format
-        batch, packed_seq_params = get_thd_batch_on_this_cp_rank(batch, cu_seqlens, cu_seqlens_padded, max_seqlen)
-    else: # Hybrid CP format
-        batch, packed_seq_params = get_batch_on_this_hybrid_cp_rank(batch, local_cp_size)
+    else:  # Packed THD format
+        batch, packed_seq_params = get_thd_batch_on_this_cp_rank(
+            batch, cu_seqlens, cu_seqlens_padded, max_seqlen
+        )
 
     return (*batch.values(), packed_seq_params)
 
@@ -245,7 +270,9 @@ def forward_step(data_iterator, model: GPTModel, return_schedule_plan: bool = Fa
     global stimer
     with stimer(bdata=True):
         vp_stage = get_attr_wrapped_model(model, "vp_stage")
-        tokens, labels, loss_mask, attention_mask, position_ids, packed_seq_params = get_batch(data_iterator, vp_stage)
+        tokens, labels, loss_mask, attention_mask, position_ids, packed_seq_params = get_batch(
+            data_iterator, vp_stage
+        )
     timers('batch-generator').stop()
 
     with stimer:
@@ -253,15 +280,21 @@ def forward_step(data_iterator, model: GPTModel, return_schedule_plan: bool = Fa
             output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
         else:
             if return_schedule_plan:
-                assert args.overlap_moe_expert_parallel_comm, \
-                    "overlap_moe_expert_parallel_comm must be enabled to return the schedule plan"
+                assert (
+                    args.overlap_moe_expert_parallel_comm
+                ), "overlap_moe_expert_parallel_comm must be enabled to return the schedule plan"
                 schedule_plan = model.build_schedule_plan(
                     tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask
                 )
                 return schedule_plan, partial(loss_func, loss_mask, model=model)
             else:
                 output_tensor = model(
-                    tokens, position_ids, attention_mask, labels=labels, loss_mask=loss_mask, packed_seq_params=packed_seq_params
+                    tokens,
+                    position_ids,
+                    attention_mask,
+                    labels=labels,
+                    loss_mask=loss_mask,
+                    packed_seq_params=packed_seq_params,
                 )
 
     # [ModelOpt]: model is needed to access ModelOpt distillation losses
@@ -275,9 +308,8 @@ def is_dataset_built_on_rank(vp_stage=None, is_packed_sequence=False):
         return False
     elif is_packed_sequence:
         return True
-    return (
-        is_first_or_last_pipeline_stage(vp_stage)
-        or mtp_on_this_rank(config, ignore_virtual=False, vp_stage=vp_stage)
+    return is_first_or_last_pipeline_stage(vp_stage) or mtp_on_this_rank(
+        config, ignore_virtual=False, vp_stage=vp_stage
     )
 
 
@@ -318,8 +350,9 @@ def core_gpt_dataset_config_from_args(args):
         "defer_npy_index_mmap": args.dataloader_defer_npy_index_mmap,
         "context_parallel_size": args.context_parallel_size,
         "data_parallel_size": args.data_parallel_size,
-        "sequence_parallel_size": args.tensor_model_parallel_size*args.sequence_parallel,
-        "hybrid_context_parallel": args.hybrid_context_parallel,
+        "sequence_parallel_size": args.tensor_model_parallel_size * args.sequence_parallel,
+        "dynamic_context_parallel": args.dynamic_context_parallel,
+        "sft_mock_dataset_config_json": args.sft_mock_dataset_config_json,
     }
 
     # add FIM args to the config
@@ -356,10 +389,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
 
     config = core_gpt_dataset_config_from_args(args)
 
-
     is_packed_sequence = False
     if args.sft:
-        dataset_type = SFTDataset
+        if args.mock_data:
+            dataset_type = MockSFTDataset
+        else:
+            dataset_type = SFTDataset
         is_packed_sequence = True  # SFT always uses packed sequence
     else:
         if args.mock_data:
@@ -371,7 +406,9 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
 
     print_rank_0("> building train, validation, and test datasets for GPT ...")
 
-    is_dataset_built = partial(is_dataset_built_on_rank, vp_stage=vp_stage, is_packed_sequence=is_packed_sequence)
+    is_dataset_built = partial(
+        is_dataset_built_on_rank, vp_stage=vp_stage, is_packed_sequence=is_packed_sequence
+    )
     train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
         dataset_type, train_val_test_num_samples, is_dataset_built, config
     ).build()
diff --git a/pretrain_mamba.py b/pretrain_mamba.py
index 590eb92ab28..df709883a66 100644
--- a/pretrain_mamba.py
+++ b/pretrain_mamba.py
@@ -3,6 +3,7 @@
 
 # Capture the true program start time BEFORE any heavy imports.
 import time
+
 _PROGRAM_START_TIME = time.time()
 
 import json
@@ -10,6 +11,7 @@
 # Suppress warnings on all ranks but rank 0.
 import os
 import warnings
+
 rank = int(os.environ.get('RANK', 0))
 if rank != 0:
     warnings.filterwarnings("ignore", category=UserWarning)
@@ -25,15 +27,12 @@
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
 from megatron.core.enums import ModelType
-from megatron.core.packed_seq_params import PackedSeqParams
-from megatron.core.parallel_state import (
-    get_context_parallel_rank,
-    get_context_parallel_world_size,
-)
 from megatron.core.models.mamba import MambaModel
+from megatron.core.packed_seq_params import PackedSeqParams
+from megatron.core.parallel_state import get_context_parallel_rank, get_context_parallel_world_size
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
-from megatron.core.utils import get_attr_wrapped_model, is_te_min_version, StragglerDetector
+from megatron.core.utils import StragglerDetector, get_attr_wrapped_model, is_te_min_version
 from megatron.training import (
     get_args,
     get_timers,
@@ -55,6 +54,7 @@
 try:
     from megatron.post_training.arguments import add_modelopt_args
     from megatron.post_training.loss_func import loss_func as loss_func_modelopt
+
     has_nvidia_modelopt = True
 except ImportError:
     has_nvidia_modelopt = False
@@ -91,11 +91,11 @@ def get_batch(data_iterator, vp_stage=None):
         return empty_batch.values()
 
     batch = get_batch_on_this_tp_rank(data_iterator)
-    
+
     cu_seqlens = batch['cu_seqlens']
     # Unused at the moment
     cu_seqlens_padded = batch.pop('cu_seqlens_padded', None)
-    # Support for Hybrid Context Parallel (Unused in this script)
+    # Support for Dynamic Context Parallel (Unused in this script)
     local_cp_size = batch.pop('local_cp_size', None)
 
     if cu_seqlens is not None:
@@ -130,12 +130,7 @@ def get_batch(data_iterator, vp_stage=None):
                 "Context Parallel with THD format data"
             )
             cp_rank = get_context_parallel_rank()
-            index = tex.thd_get_partitioned_indices(
-                cu_seqlens,
-                total_tokens,
-                cp_size,
-                cp_rank,
-            )
+            index = tex.thd_get_partitioned_indices(cu_seqlens, total_tokens, cp_size, cp_rank)
             for key, data in batch.items():
                 if key in {'attention_mask', 'cu_seqlens', 'max_seqlen'}:
                     continue
@@ -150,7 +145,10 @@ def get_batch(data_iterator, vp_stage=None):
 # define spiky loss as a loss that's 10x the max loss observed
 SPIKY_LOSS_FACTOR = 10
 
-def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, model: Optional[MambaModel] = None):
+
+def loss_func(
+    loss_mask: torch.Tensor, output_tensor: torch.Tensor, model: Optional[MambaModel] = None
+):
     """Loss function.
 
     Args:
@@ -181,14 +179,14 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, model: Optio
             result=loss,
             rejection_func=torch.isnan,
             message="found NaN in local forward loss calculation",
-            tolerance=0.0,        # forward pass calculations are deterministic
+            tolerance=0.0,  # forward pass calculations are deterministic
             fatal=True,
         )
         rerun_state_machine.validate_result(
             result=loss,
             rejection_func=torch.isinf,
             message="found Inf in local forward loss calculation",
-            tolerance=0.0,        # forward pass calculations are deterministic
+            tolerance=0.0,  # forward pass calculations are deterministic
             fatal=True,
         )
     # Check for spiky loss
@@ -201,7 +199,7 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, model: Optio
                 context="loss",
             ),
             message="Spiky loss",
-            tolerance=0.0,        # forward pass calculations are deterministic
+            tolerance=0.0,  # forward pass calculations are deterministic
             fatal=False,
         )
 
@@ -224,15 +222,9 @@ def forward_step(data_iterator, model: MambaModel):
 
     with stimer(bdata=True):
         vp_stage = get_attr_wrapped_model(model, "vp_stage")
-        (
-            tokens,
-            labels,
-            loss_mask,
-            attention_mask,
-            position_ids,
-            cu_seqlens,
-            max_seqlen,
-        ) = get_batch(data_iterator, vp_stage)
+        (tokens, labels, loss_mask, attention_mask, position_ids, cu_seqlens, max_seqlen) = (
+            get_batch(data_iterator, vp_stage)
+        )
 
     if cu_seqlens is None:
         packed_seq_params = None
@@ -258,7 +250,7 @@ def forward_step(data_iterator, model: MambaModel):
             attention_mask,
             labels=labels,
             packed_seq_params=packed_seq_params,
-            loss_mask=loss_mask
+            loss_mask=loss_mask,
         )
 
     # [ModelOpt]: model is needed to access ModelOpt distillation losses
@@ -336,7 +328,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
         dataset_type,
         train_val_test_num_samples,
         partial(is_dataset_built_on_rank, vp_stage=vp_stage, is_packed_sequence=is_packed_sequence),
-        config
+        config,
     ).build()
 
     print_rank_0("> finished creating GPT datasets ...")
@@ -361,9 +353,10 @@ def train_valid_test_datasets_provider(train_val_test_num_samples, vp_stage=None
         extra_args_provider=add_modelopt_args if has_nvidia_modelopt else None,
         args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
     )
-    pretrain(train_valid_test_datasets_provider,
-             partial(model_provider, mamba_builder),
-             ModelType.encoder_or_decoder,
-             forward_step,
-             store=store,
-             )
+    pretrain(
+        train_valid_test_datasets_provider,
+        partial(model_provider, mamba_builder),
+        ModelType.encoder_or_decoder,
+        forward_step,
+        store=store,
+    )
diff --git a/pretrain_t5.py b/pretrain_t5.py
index 2b10fa3ffe3..59918930a78 100644
--- a/pretrain_t5.py
+++ b/pretrain_t5.py
@@ -10,7 +10,6 @@
 
 import megatron
 from megatron.core import mpu, tensor_parallel
-from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
 from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
 from megatron.core.datasets.t5_dataset import (
     T5MaskedWordPieceDataset,
@@ -25,6 +24,7 @@
     get_t5_encoder_with_local_block_spec,
     get_t5_encoder_with_transformer_engine_block_spec,
 )
+from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
 from megatron.training import get_args, get_timers, pretrain, print_rank_0
 from megatron.training.arguments import core_transformer_config_from_args, parse_and_validate_args
 from pretrain_gpt import loss_func
@@ -86,7 +86,7 @@ def model_provider(
     """
 
     args = get_args()
-    
+
     if config is None:
         config = core_transformer_config_from_args(args)
 
diff --git a/pretrain_vlm.py b/pretrain_vlm.py
index a6aef770002..720094b5ec7 100644
--- a/pretrain_vlm.py
+++ b/pretrain_vlm.py
@@ -7,19 +7,11 @@
 import torch
 
 from megatron.core import mpu, parallel_state, tensor_parallel
-from megatron.core.datasets.blended_megatron_dataset_builder import (
-    BlendedMegatronDatasetBuilder,
-)
-from megatron.core.datasets.multimodal_dataset import (
-    MockMultimodalDataset,
-    MultimodalDatasetConfig,
-)
+from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
+from megatron.core.datasets.multimodal_dataset import MockMultimodalDataset, MultimodalDatasetConfig
 from megatron.core.enums import ModelType
 from megatron.core.models.multimodal import context_parallel
-from megatron.core.models.multimodal.llava_model import (
-    DEFAULT_IMAGE_TOKEN_INDEX,
-    LLaVAModel,
-)
+from megatron.core.models.multimodal.llava_model import DEFAULT_IMAGE_TOKEN_INDEX, LLaVAModel
 from megatron.core.models.multimodal.llava_spec import (
     decoder_model_with_local_default_spec,
     decoder_model_with_transformer_engine_default_spec,
@@ -31,13 +23,7 @@
 )
 from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.transformer.spec_utils import import_module
-from megatron.training import (
-    get_args,
-    get_timers,
-    get_tokenizer,
-    pretrain,
-    print_rank_0,
-)
+from megatron.training import get_args, get_timers, get_tokenizer, pretrain, print_rank_0
 from megatron.training.arguments import core_transformer_config_from_args, parse_and_validate_args
 from pretrain_gpt import loss_func
 
@@ -70,13 +56,23 @@ def model_provider(
     args = get_args()
     vision_model_type = "clip"
 
-    assert args.ckpt_format == 'torch', "Only ckpt-format torch is supported for VLM training currently."
-    assert not (args.context_parallel_size > 1 and args.pipeline_model_parallel_size > 1), "PP+CP is not yet supported by this script. \
+    assert (
+        args.ckpt_format == 'torch'
+    ), "Only ckpt-format torch is supported for VLM training currently."
+    assert not (
+        args.context_parallel_size > 1 and args.pipeline_model_parallel_size > 1
+    ), "PP+CP is not yet supported by this script. \
     Current mock dataset does not support natively packed sequence dataset required for correct PP comm shapes."
 
     num_image_embeddings = get_num_image_embeddings(
-        args.img_h, args.img_w, args.patch_dim, vision_model_type, args.disable_vision_class_token,
-        class_token_len=1, pixel_shuffle=False, use_tile_tags=False
+        args.img_h,
+        args.img_w,
+        args.patch_dim,
+        vision_model_type,
+        args.disable_vision_class_token,
+        class_token_len=1,
+        pixel_shuffle=False,
+        use_tile_tags=False,
     )
 
     old_seq_length = args.seq_length
@@ -99,7 +95,7 @@ def model_provider(
         args.tensor_model_parallel_size,
         args.sequence_parallel,
         args.decoder_tp_comm_overlap,
-        args.decoder_seq_length
+        args.decoder_seq_length,
     )
     args.decoder_seq_length = decoder_seq_len + mp_padding_needed
 
@@ -115,8 +111,9 @@ def model_provider(
     else:
         language_transformer_config.num_layers = args.num_layers
     if args.decoder_tp_comm_overlap:
-        assert args.transformer_impl == "transformer_engine", \
-            "TransformerEngine is needed to support Decoder TP Comm overlap"
+        assert (
+            args.transformer_impl == "transformer_engine"
+        ), "TransformerEngine is needed to support Decoder TP Comm overlap"
         language_transformer_config.tp_comm_overlap = args.decoder_tp_comm_overlap
 
     if args.spec is not None:
@@ -132,10 +129,24 @@ def model_provider(
 
     # Prepare mask type for any required padding to support CP/SP sequence sharding.
     if mp_padding_needed > 0:
-        if language_transformer_layer_spec.submodules.self_attention.params.get('attn_mask_type', '') == AttnMaskType.causal:
-            language_transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] = AttnMaskType.padding_causal
-        elif language_transformer_layer_spec.submodules.self_attention.params.get('attn_mask_type', '') == AttnMaskType.no_mask:
-            language_transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] = AttnMaskType.padding
+        if (
+            language_transformer_layer_spec.submodules.self_attention.params.get(
+                'attn_mask_type', ''
+            )
+            == AttnMaskType.causal
+        ):
+            language_transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] = (
+                AttnMaskType.padding_causal
+            )
+        elif (
+            language_transformer_layer_spec.submodules.self_attention.params.get(
+                'attn_mask_type', ''
+            )
+            == AttnMaskType.no_mask
+        ):
+            language_transformer_layer_spec.submodules.self_attention.params['attn_mask_type'] = (
+                AttnMaskType.padding
+            )
 
     if args.transformer_impl == "transformer_engine":
         vision_transformer_layer_spec = get_vit_layer_with_transformer_engine_spec()
@@ -148,7 +159,7 @@ def model_provider(
     vision_transformer_config.first_pipeline_num_layers = None
     vision_transformer_config.last_pipeline_num_layers = None
     vision_transformer_config.vision_model_type = vision_model_type
-    vision_transformer_config.context_parallel_size = 1 # Force CP=1 for Vision Transformer
+    vision_transformer_config.context_parallel_size = 1  # Force CP=1 for Vision Transformer
     if vision_transformer_config.sequence_parallel:
         print_rank_0("> Disabling Sequence parallelism in Vision Transformer. Not yet supported")
         vision_transformer_config.sequence_parallel = False
@@ -158,7 +169,7 @@ def model_provider(
 
     vision_projection_type = "mlp"
     vision_projection_config = deepcopy(language_transformer_config)
-    vision_projection_config.context_parallel_size = 1 # Force CP=1 for Vision Projection
+    vision_projection_config.context_parallel_size = 1  # Force CP=1 for Vision Projection
     if vision_projection_config.sequence_parallel:
         print_rank_0("> Disabling Sequence parallelism in Vision Projection. Not yet supported")
         vision_projection_config.sequence_parallel = False
@@ -320,41 +331,47 @@ def get_batch(data_iterator):
         vision_model_type = "clip"
         # Calculate the number of image embedding tokens will be added to text tokens
         num_image_embeddings_per_tile = get_num_image_embeddings(
-            args.img_h, args.img_w, args.patch_dim, vision_model_type,
-            args.disable_vision_class_token, 1, False
+            args.img_h,
+            args.img_w,
+            args.patch_dim,
+            vision_model_type,
+            args.disable_vision_class_token,
+            1,
+            False,
         )
         # Pad to make sure the text sequence can be sharded equally by CP chunks.
         image_token_mask = tokens == DEFAULT_IMAGE_TOKEN_INDEX
         num_images_per_sample = torch.sum(image_token_mask, dim=-1)
-        img_seq_len = (num_image_embeddings_per_tile * num_images_per_sample - num_images_per_sample).max()
+        img_seq_len = (
+            num_image_embeddings_per_tile * num_images_per_sample - num_images_per_sample
+        ).max()
         mp_padding_needed_for_text = context_parallel.get_padding(
             tokens.shape[1] + img_seq_len,
             args.context_parallel_size,
             args.tensor_model_parallel_size,
             args.sequence_parallel,
             args.decoder_tp_comm_overlap,
-            args.decoder_seq_length
+            args.decoder_seq_length,
         )
         if mp_padding_needed_for_text > 0:
-            tokens, position_ids, labels, loss_mask = [torch.nn.functional.pad(item, (0, mp_padding_needed_for_text)) for item in (tokens, position_ids, labels, loss_mask)]
-        packed_seq_params = context_parallel.get_packed_seq_params(tokens, img_seq_len, mp_padding_needed_for_text, cp_size, args.use_packed_sequence)
+            tokens, position_ids, labels, loss_mask = [
+                torch.nn.functional.pad(item, (0, mp_padding_needed_for_text))
+                for item in (tokens, position_ids, labels, loss_mask)
+            ]
+        packed_seq_params = context_parallel.get_packed_seq_params(
+            tokens, img_seq_len, mp_padding_needed_for_text, cp_size, args.use_packed_sequence
+        )
 
         if packed_seq_params.qkv_format == 'thd':
             # Reshape from [B,S] to [T,1]
-            tokens = (
-                tokens.contiguous()
-                .view(tokens.shape[0] * tokens.shape[1])
-                .unsqueeze(0)
-            )
+            tokens = tokens.contiguous().view(tokens.shape[0] * tokens.shape[1]).unsqueeze(0)
             position_ids = (
                 position_ids.contiguous()
                 .view(position_ids.shape[0] * position_ids.shape[1])
                 .unsqueeze(0)
             )
             labels = labels.view(labels.shape[0] * labels.shape[1]).unsqueeze(0)
-            loss_mask = loss_mask.view(
-                loss_mask.shape[0] * loss_mask.shape[1]
-            ).unsqueeze(0)
+            loss_mask = loss_mask.view(loss_mask.shape[0] * loss_mask.shape[1]).unsqueeze(0)
 
     attention_mask = None  # Use the attention mask type defined in layer spec. Typically no mask for the vision model and causal mask for the vision model.
 
@@ -376,11 +393,19 @@ def forward_step(data_iterator, model: LLaVAModel):
 
     # Get the batch.
     timers('batch-generator', log_level=2).start()
-    tokens, position_ids, labels, images, loss_mask, attention_mask, packed_seq_params = get_batch(data_iterator)
+    tokens, position_ids, labels, images, loss_mask, attention_mask, packed_seq_params = get_batch(
+        data_iterator
+    )
     timers('batch-generator').stop()
 
     output_tensor, loss_mask = model(
-        images, tokens, position_ids, attention_mask, labels, loss_mask, packed_seq_params=packed_seq_params
+        images,
+        tokens,
+        position_ids,
+        attention_mask,
+        labels,
+        loss_mask,
+        packed_seq_params=packed_seq_params,
     )
 
     return output_tensor, partial(loss_func, loss_mask)
@@ -401,15 +426,21 @@ def add_vlm_extra_args(parser):
         default=False,
         help="Drop vision model class token",
     )
-    group.add_argument("--dataloader-seq-length", type=int, help="Make dataloader to produce sequences of specific length.")
-    group.add_argument("--decoder-tp-comm-overlap", action="store_true", default=False, help="Enables the overlap of "
-                        "Tensor parallel communication and GEMM kernels in Decoder only. "
-                        "Please provide decoder-seq-length when using this feature.")
     group.add_argument(
-        "--use-packed-sequence",
+        "--dataloader-seq-length",
+        type=int,
+        help="Make dataloader to produce sequences of specific length.",
+    )
+    group.add_argument(
+        "--decoder-tp-comm-overlap",
         action="store_true",
         default=False,
-        help="Use packed sequence",
+        help="Enables the overlap of "
+        "Tensor parallel communication and GEMM kernels in Decoder only. "
+        "Please provide decoder-seq-length when using this feature.",
+    )
+    group.add_argument(
+        "--use-packed-sequence", action="store_true", default=False, help="Use packed sequence"
     )
     return parser
 
@@ -440,8 +471,7 @@ def llava_position_embedding_ranks(pp_ranks):
     train_valid_test_datasets_provider.is_distributed = True
 
     parse_and_validate_args(
-        extra_args_provider=add_vlm_extra_args,
-        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
+        extra_args_provider=add_vlm_extra_args, args_defaults={'tokenizer_type': 'GPT2BPETokenizer'}
     )
     pretrain(
         train_valid_test_datasets_provider,
diff --git a/pyproject.toml b/pyproject.toml
index 5b09cbd2a66..a3dbde52456 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,12 +8,7 @@ build-backend = "setuptools.build_meta"
 include-package-data = true
 
 [tool.setuptools.packages.find]
-include = [
-    "megatron.core",
-    "megatron.core.*",
-    "megatron.training",
-    "megatron.training.*",
-]
+include = ["megatron.core", "megatron.core.*", "megatron.training", "megatron.training.*"]
 
 [tool.setuptools.dynamic]
 version = { attr = "megatron.core.package_info.__version__" }
@@ -23,7 +18,7 @@ readme = { file = "README.md", content-type = "text/markdown" }
 name = "megatron-core"
 dynamic = ["version", "readme"]
 description = "Megatron Core - a library for efficient and scalable training of transformer based models"
-requires-python = ">=3.12"
+requires-python = ">=3.10"
 license = { text = "Apache 2.0" }
 dependencies = ["torch>=2.6.0", "numpy", "packaging>=24.2"]
 authors = [{ name = "NVIDIA", email = "nemo-toolkit@nvidia.com" }]
@@ -89,7 +84,7 @@ mlm = [
 
 dev = [
     "nvidia-modelopt[torch]; sys_platform != 'darwin'",
-    "transformer-engine[pytorch,core_cu13]",
+    "transformer-engine[pytorch,core_cu13]>=2.9.0a0,<2.12.0",
     "nvidia-resiliency-ext",
     "tqdm",
     "einops~=0.8",
@@ -100,18 +95,18 @@ dev = [
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
     "flash-linear-attention~=0.4.0",
+    "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
     "av",
     "flashinfer-python~=0.5.0",
     "wget",
     "onnxscript",
-    "fastapi~=0.50",                                    # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
+    "fastapi~=0.50",                                          # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
     "datasets",
-    "emerging_optimizers",
-    "hypercorn",
+    "emerging_optimizers; python_version >= '3.12'",
     "quart",
+    "hypercorn",
     "openai[aiohttp]",
-    "orjson",
 ]
 
 lts = [
@@ -123,6 +118,7 @@ lts = [
     "opentelemetry-api~=1.33.1",
     "mamba-ssm~=2.2",
     "causal-conv1d~=1.5",
+    "nv-grouped-gemm~=1.1",
     "megatron-energon[av_decode]~=6.0",
     "av",
     "flashinfer-python~=0.5.0",
@@ -130,7 +126,7 @@ lts = [
     "onnxscript",
     "fastapi~=0.50",                      # Forcing a little bit more recent version of fastapi to be compatible with pydantic 2.0
     "datasets",
-    "emerging_optimizers",
+    "emerging_optimizers; python_version >= '3.12'",
 ]
 
 [dependency-groups]
@@ -140,7 +136,6 @@ test = [
     "wrapt",
     "pytest==8.3.5",
     "pytest-mock",
-    "mock",
     "pytest-cov",
     "pytest-random-order",
     "pytest-asyncio",
@@ -175,16 +170,17 @@ linting = [
     "pylint==3.2.6",
 ]
 ci = ["python-gitlab", "slack-sdk", "pandas"]
-no_pypi_wheels = ["flash_mla", "emerging_optimizers"]
+no_pypi_wheels = ["emerging_optimizers; python_version >= '3.12'", "fast-hadamard-transform"]
 
 [tool.uv]
 default-groups = ["linting", "build", "test"]
 no-build-isolation-package = [
     "causal-conv1d",
-    "flash_mla",
+    "nv-grouped-gemm",
     "mamba-ssm",
     "transformer-engine",
     "transformer-engine-torch",
+    "fast-hadamard-transform",
 ]
 link-mode = "copy"
 conflicts = [[{ extra = "lts" }, { extra = "dev" }]]
@@ -205,7 +201,8 @@ flash_mla = [
 transformer-engine = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "f031cf87bd054c7558b887df7bed93975456667f" }
 nemo-run = { git = "https://github.com/NVIDIA-NeMo/Run.git", rev = "17ae86b64d7f75653351664f5d8c9e466faede00" }
 emerging_optimizers = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git", rev = "v0.2.0" }
-nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "b2bb3d728a18795807d9f76c535e005a609a1b01" }
+fast-hadamard-transform = { git = "https://github.com/Dao-AILab/fast-hadamard-transform.git", rev = "f134af63deb2df17e1171a9ec1ea4a7d8604d5ca" }
+nvidia-resiliency-ext = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git", rev = "15a851565a4ce846c04431ecb0cf09903ab4837e" }
 
 [tool.isort]
 profile = "black"                                                          # black-compatible
@@ -234,7 +231,7 @@ exclude = '''
 '''
 
 [tool.pytest.ini_options]
-addopts = "--durations=15 -s -rA"
+addopts = "--durations=15 -s -rA -x"
 testpaths = ["tests"]
 python_files = "test_*.py"
 markers = [
@@ -247,10 +244,8 @@ markers = [
 concurrency = ["thread", "multiprocessing"]
 omit = [
     "/tmp/*",
-    "/opt/megatron-lm/tests/*",
-    "/opt/megatron-lm/tools/*",
+    "/workspace/tests/*",
     "/usr/local/lib/python3.12/dist-packages/*",
-    "/opt/megatron-lm/_remote_module_non_scriptable",
 ]
 parallel = true
 sigterm = false
diff --git a/tasks/finetune_utils.py b/tasks/finetune_utils.py
index 9507ba5dbf0..faf3ae9c96f 100644
--- a/tasks/finetune_utils.py
+++ b/tasks/finetune_utils.py
@@ -2,25 +2,27 @@
 
 """Finetune utilities."""
 
-from functools import partial
 import sys
+from functools import partial
+
 import torch
 
-from megatron.training import get_args
-from megatron.core.num_microbatches_calculator import get_num_microbatches
-from megatron.training import print_rank_0
-from megatron.training import get_timers
 from megatron.core import mpu
 from megatron.core.enums import ModelType
-from megatron.training.checkpointing import load_checkpoint
-from megatron.training.checkpointing import save_checkpoint
-from megatron.training.training import evaluate_and_print_results
-from megatron.training.training import setup_model_and_optimizer
-from megatron.training.training import train_step
-from megatron.training.training import training_log
-from megatron.training.utils import average_losses_across_data_parallel_group
-from megatron.training.utils import calc_params_l2_norm
-from megatron.training.utils import check_adlr_autoresume_termination
+from megatron.core.num_microbatches_calculator import get_num_microbatches
+from megatron.training import get_args, get_timers, print_rank_0
+from megatron.training.checkpointing import load_checkpoint, save_checkpoint
+from megatron.training.training import (
+    evaluate_and_print_results,
+    setup_model_and_optimizer,
+    train_step,
+    training_log,
+)
+from megatron.training.utils import (
+    average_losses_across_data_parallel_group,
+    calc_params_l2_norm,
+    check_adlr_autoresume_termination,
+)
 
 
 def process_batch(batch):
@@ -69,25 +71,27 @@ def _cross_entropy_forward_step(batch, model):
     return output_tensor, partial(cross_entropy_loss_func, labels)
 
 
-def build_data_loader(dataset, micro_batch_size, num_workers, drop_last,
-        task_collate_fn=None):
+def build_data_loader(dataset, micro_batch_size, num_workers, drop_last, task_collate_fn=None):
     """Data loader. Note that batch-size is the local (per GPU) batch-size."""
 
     # Sampler.
     world_size = mpu.get_data_parallel_world_size()
     rank = mpu.get_data_parallel_rank()
     sampler = torch.utils.data.distributed.DistributedSampler(
-        dataset, num_replicas=world_size, rank=rank)
+        dataset, num_replicas=world_size, rank=rank
+    )
 
     # Data loader. Note that batch size is the per GPU batch size.
-    data_loader = torch.utils.data.DataLoader(dataset,
-                                              batch_size=micro_batch_size,
-                                              sampler=sampler,
-                                              shuffle=False,
-                                              num_workers=num_workers,
-                                              drop_last=drop_last,
-                                              pin_memory=True,
-                                              collate_fn=task_collate_fn)
+    data_loader = torch.utils.data.DataLoader(
+        dataset,
+        batch_size=micro_batch_size,
+        sampler=sampler,
+        shuffle=False,
+        num_workers=num_workers,
+        drop_last=drop_last,
+        pin_memory=True,
+        collate_fn=task_collate_fn,
+    )
 
     return data_loader
 
@@ -103,24 +107,27 @@ def _build_infinite_size_dataloader(dataloader):
             iterator = dataloader.__iter__()
 
 
-def _build_train_valid_dataloaders(train_dataset, valid_dataset, 
-    task_collate_fn=None):
+def _build_train_valid_dataloaders(train_dataset, valid_dataset, task_collate_fn=None):
     """Traing and validation dataloaders."""
     args = get_args()
 
     print_rank_0('building train and validation dataloaders ...')
     # Training dataset.
-    train_dataloader = build_data_loader(train_dataset, args.micro_batch_size,
-                                         args.num_workers, not args.keep_last,
-                                         task_collate_fn)
+    train_dataloader = build_data_loader(
+        train_dataset, args.micro_batch_size, args.num_workers, not args.keep_last, task_collate_fn
+    )
     # Set the training iterations.
     args.train_iters_per_epoch = len(train_dataloader)
     args.train_iters = args.epochs * args.train_iters_per_epoch
     # Validation dataset. For this dataset, we do not need to set up
     # shuffling so we can just use a simple infinite loop.
-    valid_dataloader_ = build_data_loader(valid_dataset, args.eval_micro_batch_size,
-                                          args.num_workers, not args.keep_last,
-                                          task_collate_fn)
+    valid_dataloader_ = build_data_loader(
+        valid_dataset,
+        args.eval_micro_batch_size,
+        args.num_workers,
+        not args.keep_last,
+        task_collate_fn,
+    )
     valid_dataloader = _build_infinite_size_dataloader(valid_dataloader_)
 
     # Now that we've built the data loaders, set batch_size arguments
@@ -144,13 +151,22 @@ def _build_train_valid_dataloaders(train_dataset, valid_dataset,
     return train_dataloader, valid_dataloader
 
 
-def _train(model, optimizer, opt_param_scheduler, forward_step,
-           train_dataloader, valid_dataloader, end_of_epoch_callback):
+def _train(
+    model,
+    optimizer,
+    opt_param_scheduler,
+    forward_step,
+    train_dataloader,
+    valid_dataloader,
+    end_of_epoch_callback,
+):
     """Train the model."""
     args = get_args()
     timers = get_timers()
 
-    assert get_num_microbatches() == 1, "finetuning with gradient accumulation doesn't currently work"
+    assert (
+        get_num_microbatches() == 1
+    ), "finetuning with gradient accumulation doesn't currently work"
 
     # Turn on training mode which enables dropout.
     for m in model:
@@ -194,33 +210,39 @@ def _train(model, optimizer, opt_param_scheduler, forward_step,
             params_norm = None
             if args.log_params_norm:
                 params_norm = calc_params_l2_norm(model)
-            report_memory_flag = training_log(losses_dict, losses_dict_sum,
-                                              optimizer.param_groups[0]['lr'],
-                                              iteration,
-                                              optimizer.get_loss_scale().item(),
-                                              report_memory_flag, skipped_iter,
-                                              grad_norm, params_norm, num_zeros_in_grad)
+            report_memory_flag = training_log(
+                losses_dict,
+                losses_dict_sum,
+                optimizer.param_groups[0]['lr'],
+                iteration,
+                optimizer.get_loss_scale().item(),
+                report_memory_flag,
+                skipped_iter,
+                grad_norm,
+                params_norm,
+                num_zeros_in_grad,
+            )
 
             # Autoresume
-            if args.adlr_autoresume and \
-               (iteration % args.adlr_autoresume_interval == 0):
-                check_adlr_autoresume_termination(iteration, model,
-                                                  optimizer, opt_param_scheduler)
+            if args.adlr_autoresume and (iteration % args.adlr_autoresume_interval == 0):
+                check_adlr_autoresume_termination(iteration, model, optimizer, opt_param_scheduler)
 
             # Checkpointing
             saved_checkpoint = False
-            if args.save and args.save_interval and \
-               iteration % args.save_interval == 0:
+            if args.save and args.save_interval and iteration % args.save_interval == 0:
                 save_checkpoint(iteration, model, optimizer, opt_param_scheduler)
                 saved_checkpoint = True
 
             # Evaluation
-            if args.eval_interval and iteration % args.eval_interval == 0 \
-                    and (args.start_eval_at_iter is None or iteration >= args.start_eval_at_iter):
+            if (
+                args.eval_interval
+                and iteration % args.eval_interval == 0
+                and (args.start_eval_at_iter is None or iteration >= args.start_eval_at_iter)
+            ):
                 prefix = 'iteration {}'.format(iteration)
-                evaluate_and_print_results(prefix, forward_step,
-                                           valid_dataloader, model,
-                                           iteration, None, False)
+                evaluate_and_print_results(
+                    prefix, forward_step, valid_dataloader, model, iteration, None, False
+                )
 
             # Exiting based on iterations
             if args.exit_interval and iteration % args.exit_interval == 0:
@@ -239,11 +261,14 @@ def _train(model, optimizer, opt_param_scheduler, forward_step,
             end_of_epoch_callback(model, epoch)
 
 
-def finetune(train_valid_datasets_provider, model_provider,
-             model_type=ModelType.encoder_or_decoder,
-             forward_step=_cross_entropy_forward_step,
-             end_of_epoch_callback_provider=None,
-             task_collate_fn=None):
+def finetune(
+    train_valid_datasets_provider,
+    model_provider,
+    model_type=ModelType.encoder_or_decoder,
+    forward_step=_cross_entropy_forward_step,
+    end_of_epoch_callback_provider=None,
+    task_collate_fn=None,
+):
     """Main finetune function used across all tasks."""
     args = get_args()
     timers = get_timers()
@@ -253,7 +278,8 @@ def finetune(train_valid_datasets_provider, model_provider,
     if args.epochs > 0:
         train_dataset, valid_dataset = train_valid_datasets_provider()
         train_dataloader, valid_dataloader = _build_train_valid_dataloaders(
-            train_dataset, valid_dataset, task_collate_fn)
+            train_dataset, valid_dataset, task_collate_fn
+        )
     else:
         args.train_iters = 0
     timers('train/valid/test dataset/dataloder').stop()
@@ -289,14 +315,28 @@ def finetune(train_valid_datasets_provider, model_provider,
 
     # Print setup timing.
     print_rank_0('done with setups ...')
-    timers.log(['train/valid/test dataset/dataloder', 'callback function',
-                'model and optimizer', 'pretrained checkpoint'], barrier=True)
+    timers.log(
+        [
+            'train/valid/test dataset/dataloder',
+            'callback function',
+            'model and optimizer',
+            'pretrained checkpoint',
+        ],
+        barrier=True,
+    )
     print_rank_0('training ...')
 
     # Finetune the model.
     if args.epochs > 0:
-        _train(model, optimizer, opt_param_scheduler, forward_step,
-               train_dataloader, valid_dataloader, end_of_epoch_callback)
+        _train(
+            model,
+            optimizer,
+            opt_param_scheduler,
+            forward_step,
+            train_dataloader,
+            valid_dataloader,
+            end_of_epoch_callback,
+        )
     # Or just evaluate.
     else:
         if end_of_epoch_callback is not None:
diff --git a/tests/functional_tests/shell_test_utils/run_ci_test.sh b/tests/functional_tests/shell_test_utils/run_ci_test.sh
index 3d47e591749..fa3ed2f4db9 100644
--- a/tests/functional_tests/shell_test_utils/run_ci_test.sh
+++ b/tests/functional_tests/shell_test_utils/run_ci_test.sh
@@ -149,6 +149,10 @@ for i in $(seq 1 $N_REPEAT); do
 
     # First run never loads from a checkpoint
     export RUN_NUMBER=1
+    DIR=$(dirname "$_TENSORBOARD_PATH")
+    FILE=$(basename "$_TENSORBOARD_PATH")
+    export TENSORBOARD_PATH=$DIR/$i/$FILE
+    mkdir -p $(dirname $TENSORBOARD_PATH)
     export REPEAT=$i
     export CHECKPOINT_SAVE_PATH=$_CHECKPOINT_SAVE_PATH
     export TRAINING_EXIT_CODE=0
diff --git a/tests/functional_tests/test_cases/ci_base_config.yml b/tests/functional_tests/test_cases/ci_base_config.yml
new file mode 100644
index 00000000000..739f343da9d
--- /dev/null
+++ b/tests/functional_tests/test_cases/ci_base_config.yml
@@ -0,0 +1,14 @@
+MODEL_ARGS:
+  # Add logging args
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-params-norm: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-throughput: true
+  --log-interval: 1
+  --logging-level: 40
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  # Add checkpointing args
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml
index 4d686aba694..5c70d072482 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml
@@ -69,7 +69,6 @@ MODEL_ARGS:
   --eval-iters: 32
   --eval-interval: 2000
   # Add checkpointing args
-  --load: ${CHECKPOINT_LOAD_PATH}
   --save: ${CHECKPOINT_SAVE_PATH}
   --save-interval: 1000
   --save-retain-interval: 5000
@@ -89,7 +88,7 @@ MODEL_ARGS:
   --wandb-exp-name: ${WANDB_EXPERIMENT}
   # Add mixed precision args
   --bf16: true
-  --exit-interval: 13000
+  --exit-interval: 10200
   --wandb-save-dir: ${WANDB_SAVE_PATH}
   --async-save: true
   --use-persistent-ckpt-worker: true
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json
index 1ba701443ce..7fa302274bf 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_resume_torch_dist_tunable_overlap/golden_values_lts_dgxa100_dracooci.json
@@ -534,4 +534,4 @@
             "100": 0.16898
         }
     }
-}
\ No newline at end of file
+}
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json
index d8ec5426bd1..363e94d8f52 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap/golden_values_lts_dgxa100_dracooci.json
@@ -284,4 +284,4 @@
             "50": 0.16165
         }
     }
-}
\ No newline at end of file
+}
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json
index bdf7634a86e..22edca3da6a 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp1_pp4_vp2_account_for_embedding_loss_in_pipeline_split/golden_values_dev_dgx_gb200.json
@@ -175,7 +175,7 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 2610027008.0,
+            "1": 2610026496.0,
             "2": 2842349056.0,
             "3": 2842349056.0,
             "4": 2842349056.0,
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp4_dcp/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp4_dcp/model_config.yaml
new file mode 100644
index 00000000000..e1d7fda9e53
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp1_cp4_dcp/model_config.yaml
@@ -0,0 +1,61 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 1
+  ENABLE_LIGHTWEIGHT_MODE: true
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --moe-token-dispatcher-type: alltoall
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 2048
+  --max-position-embeddings: 2048
+  --train-iters: 50
+  --timing-log-level: 0
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --sft: true
+  --mock-data: true
+  --sft-mock-dataset-config-json: '\''{\"mode\":\"distribution\",\"type\":\"lognormal\",\"format\":\"thd\",\"min_seq_len\":128,\"max_seq_len\":2048,\"mean_seq_len\":512,\"lognormal_sigma\":1.2}\'''
+  --tokenizer-type: GPTSentencePieceTokenizer
+  --tokenizer-model: ${DATA_PATH}/text/slimpajama/preprocessed/mixtral_benchmarking/tokenizer.model
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 10000
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 1
+  --context-parallel-size: 4
+  --sequence-parallel: true
+  --dynamic-context-parallel: true
+  --calculate-per-token-loss: true
+  --min-dynamic-context-parallel-size: 1
+  --max-seqlen-per-dp-cp-rank: 512
+  --dataloader-type: single
+  --hidden-dropout: 0.0
+  --attention-dropout: 0.0
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --dist-ckpt-strictness: log_all
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --attention-backend: flash
+  --log-memory-to-tensorboard: true
+TEST_TYPE: ckpt-resume
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
index 0ebcb2e160d..c22e09c1ac6 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_cp2_etp4_nondeterministic_dp_last/golden_values_lts_dgx_a100.json
@@ -118,56 +118,56 @@
         "end_step": 50,
         "step_interval": 1,
         "values": {
-            "1": 431522304.0,
-            "2": 431522304.0,
-            "3": 431522304.0,
-            "4": 431522304.0,
-            "5": 431522304.0,
-            "6": 431522304.0,
-            "7": 431522304.0,
-            "8": 431522304.0,
-            "9": 431522304.0,
-            "10": 431522304.0,
-            "11": 431522304.0,
-            "12": 431522304.0,
-            "13": 431522304.0,
-            "14": 431522304.0,
-            "15": 431522304.0,
-            "16": 431522304.0,
-            "17": 431522304.0,
-            "18": 431522304.0,
-            "19": 431522304.0,
-            "20": 431522304.0,
-            "21": 431522304.0,
-            "22": 431522304.0,
-            "23": 431522304.0,
-            "24": 431522304.0,
-            "25": 431522304.0,
-            "26": 431522304.0,
-            "27": 431522304.0,
-            "28": 431522304.0,
-            "29": 431522304.0,
-            "30": 431522304.0,
-            "31": 431522304.0,
-            "32": 431522304.0,
-            "33": 431522304.0,
-            "34": 431522304.0,
-            "35": 431522304.0,
-            "36": 431522304.0,
-            "37": 431522304.0,
-            "38": 431522304.0,
-            "39": 431522304.0,
-            "40": 431522304.0,
-            "41": 431522304.0,
-            "42": 431522304.0,
-            "43": 431522304.0,
-            "44": 431522304.0,
-            "45": 431522304.0,
-            "46": 431522304.0,
-            "47": 431522304.0,
-            "48": 431522304.0,
-            "49": 431522304.0,
-            "50": 431522304.0
+            "1": 433619456.0,
+            "2": 433619456.0,
+            "3": 433619456.0,
+            "4": 433619456.0,
+            "5": 433619456.0,
+            "6": 433619456.0,
+            "7": 433619456.0,
+            "8": 433619456.0,
+            "9": 433619456.0,
+            "10": 433619456.0,
+            "11": 433619456.0,
+            "12": 433619456.0,
+            "13": 433619456.0,
+            "14": 433619456.0,
+            "15": 433619456.0,
+            "16": 433619456.0,
+            "17": 433619456.0,
+            "18": 433619456.0,
+            "19": 433619456.0,
+            "20": 433619456.0,
+            "21": 433619456.0,
+            "22": 433619456.0,
+            "23": 433619456.0,
+            "24": 433619456.0,
+            "25": 433619456.0,
+            "26": 433619456.0,
+            "27": 433619456.0,
+            "28": 433619456.0,
+            "29": 433619456.0,
+            "30": 433619456.0,
+            "31": 433619456.0,
+            "32": 433619456.0,
+            "33": 433619456.0,
+            "34": 433619456.0,
+            "35": 433619456.0,
+            "36": 433619456.0,
+            "37": 433619456.0,
+            "38": 433619456.0,
+            "39": 433619456.0,
+            "40": 433619456.0,
+            "41": 433619456.0,
+            "42": 433619456.0,
+            "43": 433619456.0,
+            "44": 433619456.0,
+            "45": 433619456.0,
+            "46": 433619456.0,
+            "47": 433619456.0,
+            "48": 433619456.0,
+            "49": 433619456.0,
+            "50": 433619456.0
         }
     },
     "mem-max-allocated-bytes": {
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_dsa/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_dsa/model_config.yaml
new file mode 100644
index 00000000000..63a0933313c
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_dsa/model_config.yaml
@@ -0,0 +1,66 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+  ENABLE_LIGHTWEIGHT_MODE: true
+MODEL_ARGS:
+  --num-layers: 4
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --multi-latent-attention: true
+  --q-lora-rank: 192
+  --kv-lora-rank: 64
+  --qk-head-dim: 16
+  --qk-pos-emb-head-dim: 8
+  --v-head-dim: 16
+  --experimental-attention-variant: dsa
+  --dsa-indexer-n-heads: 64
+  --dsa-indexer-head-dim: 128
+  --dsa-indexer-topk: 2048
+  --dsa-indexer-loss-coeff: 0.01
+  --attention-backend: fused
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --train-iters: 50
+  --timing-log-level: 0
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 25
+  --eval-interval: 1000
+  --eval-iters: 10
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 2
+  --sequence-parallel: true
+  --untie-embeddings-and-output-weights: true
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --attention-backend: unfused
+  --log-memory-to-tensorboard: true
+TEST_TYPE: ckpt-resume
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mhc/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mhc/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..a890eb1b600
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mhc/golden_values_dev_dgx_h100.json
@@ -0,0 +1,287 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 10.86149,
+            "2": 10.85467,
+            "3": 10.86695,
+            "4": 10.84622,
+            "5": 10.88467,
+            "6": 10.89675,
+            "7": 10.87274,
+            "8": 10.86587,
+            "9": 10.86993,
+            "10": 10.83755,
+            "11": 10.8946,
+            "12": 10.8795,
+            "13": 10.87683,
+            "14": 10.90365,
+            "15": 10.83112,
+            "16": 10.8345,
+            "17": 10.80061,
+            "18": 10.82067,
+            "19": 10.81459,
+            "20": 10.71809,
+            "21": 10.68633,
+            "22": 10.53197,
+            "23": 10.70485,
+            "24": 10.58544,
+            "25": 10.51899,
+            "26": 10.58489,
+            "27": 10.60103,
+            "28": 10.53535,
+            "29": 10.57111,
+            "30": 10.33244,
+            "31": 10.05828,
+            "32": 10.42787,
+            "33": 10.42023,
+            "34": 10.16983,
+            "35": 10.23073,
+            "36": 10.18747,
+            "37": 10.31252,
+            "38": 10.14214,
+            "39": 10.38141,
+            "40": 10.04843,
+            "41": 10.10327,
+            "42": 10.17154,
+            "43": 9.78292,
+            "44": 9.90961,
+            "45": 9.78503,
+            "46": 9.76877,
+            "47": 10.10084,
+            "48": 9.80966,
+            "49": 9.48773,
+            "50": 9.86705
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1732.0,
+            "2": 34586.0,
+            "3": 1628.0,
+            "4": 1806.0,
+            "5": 1834.0,
+            "6": 1858.0,
+            "7": 1772.0,
+            "8": 1562.0,
+            "9": 34695.0,
+            "10": 1453.0,
+            "11": 34608.0,
+            "12": 34493.0,
+            "13": 1885.0,
+            "14": 34479.0,
+            "15": 1876.0,
+            "16": 1773.0,
+            "17": 34664.0,
+            "18": 1653.0,
+            "19": 1796.0,
+            "20": 1636.0,
+            "21": 1854.0,
+            "22": 1680.0,
+            "23": 34870.0,
+            "24": 1743.0,
+            "25": 34415.0,
+            "26": 34506.0,
+            "27": 34562.0,
+            "28": 1973.0,
+            "29": 34797.0,
+            "30": 1874.0,
+            "31": 34398.0,
+            "32": 34704.0,
+            "33": 34981.0,
+            "34": 1929.0,
+            "35": 34822.0,
+            "36": 34718.0,
+            "37": 2413.0,
+            "38": 35053.0,
+            "39": 35229.0,
+            "40": 34965.0,
+            "41": 35070.0,
+            "42": 2353.0,
+            "43": 34792.0,
+            "44": 35066.0,
+            "45": 34885.0,
+            "46": 35077.0,
+            "47": 35294.0,
+            "48": 35254.0,
+            "49": 35217.0,
+            "50": 35213.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 555746816.0,
+            "2": 555746816.0,
+            "3": 555746816.0,
+            "4": 555746816.0,
+            "5": 555746816.0,
+            "6": 555746816.0,
+            "7": 555746816.0,
+            "8": 555746816.0,
+            "9": 555746816.0,
+            "10": 555746816.0,
+            "11": 555746816.0,
+            "12": 555746816.0,
+            "13": 555746816.0,
+            "14": 555746816.0,
+            "15": 555746816.0,
+            "16": 555746816.0,
+            "17": 555746816.0,
+            "18": 555746816.0,
+            "19": 555746816.0,
+            "20": 555746816.0,
+            "21": 555746816.0,
+            "22": 555746816.0,
+            "23": 555746816.0,
+            "24": 555746816.0,
+            "25": 555746816.0,
+            "26": 555746816.0,
+            "27": 555746816.0,
+            "28": 555746816.0,
+            "29": 555746816.0,
+            "30": 555746816.0,
+            "31": 555746816.0,
+            "32": 555746816.0,
+            "33": 555746816.0,
+            "34": 555746816.0,
+            "35": 555746816.0,
+            "36": 555746816.0,
+            "37": 555746816.0,
+            "38": 555746816.0,
+            "39": 555746816.0,
+            "40": 555746816.0,
+            "41": 555746816.0,
+            "42": 555746816.0,
+            "43": 555746816.0,
+            "44": 555746816.0,
+            "45": 555746816.0,
+            "46": 555746816.0,
+            "47": 555746816.0,
+            "48": 555746816.0,
+            "49": 555746816.0,
+            "50": 555746816.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": 1728349696.0,
+            "2": 1917909504.0,
+            "3": 1917909504.0,
+            "4": 1917909504.0,
+            "5": 1917909504.0,
+            "6": 1917909504.0,
+            "7": 1917909504.0,
+            "8": 1917909504.0,
+            "9": 1917909504.0,
+            "10": 1917909504.0,
+            "11": 1917909504.0,
+            "12": 1917909504.0,
+            "13": 1917909504.0,
+            "14": 1917909504.0,
+            "15": 1917909504.0,
+            "16": 1917909504.0,
+            "17": 1917909504.0,
+            "18": 1917909504.0,
+            "19": 1917909504.0,
+            "20": 1917909504.0,
+            "21": 1917909504.0,
+            "22": 1917909504.0,
+            "23": 1917909504.0,
+            "24": 1917909504.0,
+            "25": 1917909504.0,
+            "26": 1917909504.0,
+            "27": 1917909504.0,
+            "28": 1917909504.0,
+            "29": 1917909504.0,
+            "30": 1917909504.0,
+            "31": 1917909504.0,
+            "32": 1917909504.0,
+            "33": 1917909504.0,
+            "34": 1917909504.0,
+            "35": 1917909504.0,
+            "36": 1917909504.0,
+            "37": 1917909504.0,
+            "38": 1917909504.0,
+            "39": 1917909504.0,
+            "40": 1917909504.0,
+            "41": 1917909504.0,
+            "42": 1917909504.0,
+            "43": 1917909504.0,
+            "44": 1917909504.0,
+            "45": 1917909504.0,
+            "46": 1917909504.0,
+            "47": 1917909504.0,
+            "48": 1917909504.0,
+            "49": 1917909504.0,
+            "50": 1917909504.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 50,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": 30.27287,
+            "3": 0.63036,
+            "4": 0.62463,
+            "5": 0.62389,
+            "6": 0.62241,
+            "7": 0.62274,
+            "8": 0.62116,
+            "9": 0.62223,
+            "10": 0.62501,
+            "11": 0.62222,
+            "12": 0.62201,
+            "13": 0.6223,
+            "14": 0.62539,
+            "15": 0.62434,
+            "16": 0.62424,
+            "17": 0.62735,
+            "18": 0.62325,
+            "19": 0.62244,
+            "20": 0.62506,
+            "21": 0.62317,
+            "22": 0.62235,
+            "23": 0.625,
+            "24": 0.62205,
+            "25": 0.62519,
+            "26": 0.64769,
+            "27": 0.62564,
+            "28": 0.62374,
+            "29": 0.62533,
+            "30": 0.62018,
+            "31": 0.62779,
+            "32": 0.62201,
+            "33": 0.63514,
+            "34": 0.6314,
+            "35": 0.63737,
+            "36": 0.62906,
+            "37": 0.64653,
+            "38": 0.63058,
+            "39": 0.63017,
+            "40": 0.63041,
+            "41": 0.6331,
+            "42": 0.62522,
+            "43": 0.62568,
+            "44": 0.62119,
+            "45": 0.62536,
+            "46": 0.62217,
+            "47": 0.62615,
+            "48": 0.6199,
+            "49": 0.61769,
+            "50": 0.62242
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mhc/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mhc/model_config.yaml
new file mode 100644
index 00000000000..686c8bdbb59
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_mhc/model_config.yaml
@@ -0,0 +1,62 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  NCCL_ALGO: Ring
+  CUBLAS_WORKSPACE_CONFIG: :4096:8
+MODEL_ARGS:
+  --num-layers: 12
+  --hidden-size: 512
+  --num-attention-heads: 8
+  --log-params-norm: true
+  --log-num-zeros-in-grad: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-timers-to-tensorboard: true
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  --micro-batch-size: 4
+  --global-batch-size: 32
+  --seq-length: 1024
+  --max-position-embeddings: 1024
+  --train-iters: 50
+  --timing-log-level: 0
+  --lr-decay-iters: 320000
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  --distributed-backend: nccl
+  --lr: 0.00015
+  --lr-decay-style: cosine
+  --min-lr: 1.0e-5
+  --weight-decay: 1e-2
+  --clip-grad: 1.0
+  --lr-warmup-fraction: .01
+  --log-interval: 1
+  --save-interval: 25
+  --eval-interval: 50
+  --eval-iters: 50
+  --transformer-impl: transformer_engine
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 2
+  --deterministic-mode: true
+  --no-gradient-accumulation-fusion: true
+  --attention-softmax-in-fp32: true
+  --use-mcore-models: true
+  --ckpt-format: torch_dist
+  --dist-ckpt-optim-fully-reshardable: true
+  --dist-ckpt-strictness: log_all # backward compatibility for TE changes
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --bf16: true
+  --attention-backend: unfused
+  --sequence-parallel: true
+  --log-memory-to-tensorboard: true
+  --enable-hyper-connections: true
+  --num-residual-streams: 4
+  --mhc-sinkhorn-iterations: 20
+  --mhc-init-gating-factor: 0.01
+  --recompute-granularity: selective
+  --recompute-modules: "[mhc]"
+  --mhc-recompute-layer-num: 2
+  --exit-interval: 50
+TEST_TYPE: ckpt-resume
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json
index 1bf0169b170..d363dc70fbd 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cp2_nondeterministic/golden_values_dev_dgx_gb200_2nd.json
@@ -54,56 +54,56 @@
             "48": "nan",
             "49": "nan",
             "50": "nan",
-            "51": 9.7742,
-            "52": 9.66445,
-            "53": 10.00151,
-            "54": 9.89155,
-            "55": 9.81849,
-            "56": 9.54335,
-            "57": 9.39451,
-            "58": 9.76573,
-            "59": 9.5093,
-            "60": 9.42825,
-            "61": 9.63467,
-            "62": 9.93887,
-            "63": 9.30457,
-            "64": 9.70983,
-            "65": 8.86882,
-            "66": 9.64953,
-            "67": 9.3082,
-            "68": 9.73505,
-            "69": 9.7559,
-            "70": 9.68706,
-            "71": 9.57534,
-            "72": 9.53073,
-            "73": 9.43677,
-            "74": 8.85587,
-            "75": 9.35529,
-            "76": 9.01373,
-            "77": 10.02452,
-            "78": 9.68203,
-            "79": 9.33141,
-            "80": 9.35469,
-            "81": 9.43623,
-            "82": 9.65853,
-            "83": 9.26266,
-            "84": 9.36921,
-            "85": 9.571,
-            "86": 9.03325,
-            "87": 9.55972,
-            "88": 9.71078,
-            "89": 9.5541,
-            "90": 9.78661,
-            "91": 9.29086,
-            "92": 9.31236,
-            "93": 9.03977,
-            "94": 8.78115,
-            "95": 9.49176,
-            "96": 9.4907,
-            "97": 9.25833,
-            "98": 9.63003,
-            "99": 8.84687,
-            "100": 9.36199
+            "51": 10.14648,
+            "52": 10.09007,
+            "53": 10.08632,
+            "54": 10.05935,
+            "55": 10.03369,
+            "56": 10.06932,
+            "57": 10.04886,
+            "58": 10.06744,
+            "59": 10.01115,
+            "60": 10.0301,
+            "61": 9.98646,
+            "62": 9.95577,
+            "63": 10.03613,
+            "64": 9.99284,
+            "65": 9.95563,
+            "66": 9.98785,
+            "67": 9.96342,
+            "68": 9.92253,
+            "69": 9.94164,
+            "70": 9.92521,
+            "71": 9.95654,
+            "72": 9.92241,
+            "73": 9.91239,
+            "74": 9.89751,
+            "75": 9.86715,
+            "76": 9.91401,
+            "77": 9.90392,
+            "78": 9.84434,
+            "79": 9.85067,
+            "80": 9.87108,
+            "81": 9.90115,
+            "82": 9.8465,
+            "83": 9.79872,
+            "84": 9.73036,
+            "85": 9.72615,
+            "86": 9.83221,
+            "87": 9.86368,
+            "88": 9.83602,
+            "89": 9.76485,
+            "90": 9.75916,
+            "91": 9.7743,
+            "92": 9.76237,
+            "93": 9.68717,
+            "94": 9.7765,
+            "95": 9.77478,
+            "96": 9.75119,
+            "97": 9.68105,
+            "98": 9.71934,
+            "99": 9.77089,
+            "100": 9.65735
         }
     },
     "num-zeros": {
@@ -161,56 +161,56 @@
             "48": "nan",
             "49": "nan",
             "50": "nan",
-            "51": 920.0,
-            "52": 908.0,
-            "53": 1052.0,
-            "54": 889.0,
-            "55": 846.0,
-            "56": 985.0,
-            "57": 838.0,
-            "58": 1021.0,
-            "59": 1045.0,
-            "60": 896.0,
-            "61": 987.0,
-            "62": 926.0,
-            "63": 912.0,
-            "64": 1081.0,
-            "65": 991.0,
-            "66": 1095.0,
-            "67": 964.0,
-            "68": 938.0,
-            "69": 1005.0,
-            "70": 1013.0,
-            "71": 1082.0,
-            "72": 896.0,
-            "73": 1035.0,
-            "74": 687.0,
-            "75": 920.0,
-            "76": 1063.0,
-            "77": 1086.0,
-            "78": 1136.0,
-            "79": 1065.0,
-            "80": 1111.0,
-            "81": 1229.0,
-            "82": 1100.0,
-            "83": 944.0,
-            "84": 1182.0,
-            "85": 1100.0,
-            "86": 790.0,
-            "87": 1132.0,
-            "88": 1071.0,
-            "89": 1148.0,
-            "90": 1121.0,
-            "91": 1120.0,
-            "92": 1115.0,
-            "93": 944.0,
-            "94": 1126.0,
-            "95": 1116.0,
-            "96": 1115.0,
-            "97": 995.0,
-            "98": 1234.0,
-            "99": 1120.0,
-            "100": 1148.0
+            "51": 1034.0,
+            "52": 1019.0,
+            "53": 961.0,
+            "54": 1009.0,
+            "55": 982.0,
+            "56": 1088.0,
+            "57": 816.0,
+            "58": 1264.0,
+            "59": 1103.0,
+            "60": 1074.0,
+            "61": 996.0,
+            "62": 1116.0,
+            "63": 1177.0,
+            "64": 1188.0,
+            "65": 954.0,
+            "66": 1142.0,
+            "67": 1259.0,
+            "68": 1163.0,
+            "69": 1112.0,
+            "70": 1086.0,
+            "71": 1020.0,
+            "72": 1018.0,
+            "73": 1183.0,
+            "74": 1142.0,
+            "75": 1044.0,
+            "76": 1199.0,
+            "77": 1297.0,
+            "78": 1122.0,
+            "79": 1098.0,
+            "80": 1072.0,
+            "81": 1203.0,
+            "82": 1183.0,
+            "83": 1052.0,
+            "84": 1004.0,
+            "85": 1024.0,
+            "86": 1090.0,
+            "87": 1112.0,
+            "88": 1131.0,
+            "89": 1106.0,
+            "90": 1280.0,
+            "91": 987.0,
+            "92": 999.0,
+            "93": 1117.0,
+            "94": 1170.0,
+            "95": 1070.0,
+            "96": 1220.0,
+            "97": 1154.0,
+            "98": 1156.0,
+            "99": 1123.0,
+            "100": 1110.0
         }
     },
     "mem-allocated-bytes": {
@@ -375,56 +375,56 @@
             "48": "nan",
             "49": "nan",
             "50": "nan",
-            "51": 889377280.0,
-            "52": 889378304.0,
-            "53": 889378304.0,
+            "51": 889376256.0,
+            "52": 889376768.0,
+            "53": 889376768.0,
             "54": 889378304.0,
             "55": 889378304.0,
             "56": 889378304.0,
             "57": 889378304.0,
             "58": 889378304.0,
             "59": 889378304.0,
-            "60": 889378304.0,
-            "61": 889378304.0,
-            "62": 889378304.0,
-            "63": 889378304.0,
-            "64": 889378304.0,
-            "65": 889378304.0,
-            "66": 889378304.0,
-            "67": 889378304.0,
-            "68": 889378304.0,
-            "69": 889378304.0,
-            "70": 889378304.0,
-            "71": 889378304.0,
-            "72": 889378304.0,
-            "73": 889378304.0,
-            "74": 889378304.0,
-            "75": 889378304.0,
-            "76": 889378304.0,
-            "77": 889378304.0,
-            "78": 889378304.0,
-            "79": 889378304.0,
-            "80": 889378304.0,
-            "81": 889378304.0,
-            "82": 889378304.0,
-            "83": 889378304.0,
-            "84": 889378304.0,
-            "85": 889378304.0,
-            "86": 889378304.0,
-            "87": 889378304.0,
-            "88": 889378304.0,
-            "89": 889378304.0,
-            "90": 889378816.0,
-            "91": 889378816.0,
-            "92": 889378816.0,
-            "93": 889378816.0,
-            "94": 889378816.0,
-            "95": 889378816.0,
-            "96": 889378816.0,
-            "97": 889378816.0,
-            "98": 889378816.0,
-            "99": 889378816.0,
-            "100": 889378816.0
+            "60": 889378816.0,
+            "61": 889378816.0,
+            "62": 889378816.0,
+            "63": 889378816.0,
+            "64": 889378816.0,
+            "65": 889378816.0,
+            "66": 889378816.0,
+            "67": 889378816.0,
+            "68": 889378816.0,
+            "69": 890424832.0,
+            "70": 890426368.0,
+            "71": 890426368.0,
+            "72": 890426368.0,
+            "73": 890426368.0,
+            "74": 890426368.0,
+            "75": 890426368.0,
+            "76": 890426368.0,
+            "77": 890426368.0,
+            "78": 890426368.0,
+            "79": 890426368.0,
+            "80": 890426368.0,
+            "81": 890426368.0,
+            "82": 890426368.0,
+            "83": 890426368.0,
+            "84": 890426368.0,
+            "85": 890426368.0,
+            "86": 890426368.0,
+            "87": 890426368.0,
+            "88": 890426368.0,
+            "89": 890426368.0,
+            "90": 890426368.0,
+            "91": 890426368.0,
+            "92": 890426368.0,
+            "93": 890426368.0,
+            "94": 890426368.0,
+            "95": 890426368.0,
+            "96": 890426880.0,
+            "97": 890426880.0,
+            "98": 890426880.0,
+            "99": 890426880.0,
+            "100": 890426880.0
         }
     },
     "iteration-time": {
@@ -483,55 +483,55 @@
             "49": "nan",
             "50": "nan",
             "51": "nan",
-            "52": 3.77319,
-            "53": 0.42363,
-            "54": 0.41071,
-            "55": 0.41011,
-            "56": 0.40905,
-            "57": 0.40957,
-            "58": 0.41032,
-            "59": 0.40997,
-            "60": 0.4109,
-            "61": 0.4104,
-            "62": 0.40989,
-            "63": 0.40974,
-            "64": 0.40928,
-            "65": 0.40668,
-            "66": 0.4076,
-            "67": 0.41006,
-            "68": 0.41114,
-            "69": 0.40437,
-            "70": 0.40702,
-            "71": 0.4095,
-            "72": 0.41064,
-            "73": 0.40549,
-            "74": 0.40683,
-            "75": 0.4055,
-            "76": 0.40589,
-            "77": 0.40198,
-            "78": 0.40196,
-            "79": 0.40383,
-            "80": 0.40596,
-            "81": 0.40678,
-            "82": 0.40646,
-            "83": 0.40861,
-            "84": 0.40858,
-            "85": 0.40709,
-            "86": 0.40475,
-            "87": 0.41028,
-            "88": 0.40188,
-            "89": 0.40272,
-            "90": 0.4034,
-            "91": 0.40676,
-            "92": 0.40732,
-            "93": 0.40103,
-            "94": 0.40501,
-            "95": 0.4043,
-            "96": 0.40452,
-            "97": 0.40255,
-            "98": 0.40532,
-            "99": 0.40632,
-            "100": 0.4042
+            "52": 6.64879,
+            "53": 0.61877,
+            "54": 0.70232,
+            "55": 0.69398,
+            "56": 0.46021,
+            "57": 0.4538,
+            "58": 0.45586,
+            "59": 0.45496,
+            "60": 0.45627,
+            "61": 0.45912,
+            "62": 0.45861,
+            "63": 0.45656,
+            "64": 0.45957,
+            "65": 0.45711,
+            "66": 0.46007,
+            "67": 0.46004,
+            "68": 0.46286,
+            "69": 0.46393,
+            "70": 0.46128,
+            "71": 0.46073,
+            "72": 0.45621,
+            "73": 0.46188,
+            "74": 0.46133,
+            "75": 0.45991,
+            "76": 0.45837,
+            "77": 0.46117,
+            "78": 0.46033,
+            "79": 0.46132,
+            "80": 0.46496,
+            "81": 0.46253,
+            "82": 0.46437,
+            "83": 0.46136,
+            "84": 0.45869,
+            "85": 0.46353,
+            "86": 0.46817,
+            "87": 0.46638,
+            "88": 0.46933,
+            "89": 0.46096,
+            "90": 0.46113,
+            "91": 0.45879,
+            "92": 0.46143,
+            "93": 0.46364,
+            "94": 0.46259,
+            "95": 0.46784,
+            "96": 0.46382,
+            "97": 0.46115,
+            "98": 0.46216,
+            "99": 0.46091,
+            "100": 0.46141
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json
index 883e45f6fab..d3587276803 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp2_pp2_resume_torch_dist_cross_entropy_loss_fusion/golden_values_dev_dgx_gb200.json
@@ -375,56 +375,56 @@
             "48": 1789536768.0,
             "49": 1789536768.0,
             "50": 1789536768.0,
-            "51": 1789536768.0,
-            "52": 1789536768.0,
-            "53": 1789536768.0,
-            "54": 1789536768.0,
-            "55": 1789536768.0,
-            "56": 1789536768.0,
-            "57": 1789536768.0,
-            "58": 1789536768.0,
-            "59": 1789536768.0,
-            "60": 1789536768.0,
-            "61": 1789536768.0,
-            "62": 1789536768.0,
-            "63": 1789536768.0,
-            "64": 1789536768.0,
-            "65": 1789536768.0,
-            "66": 1789536768.0,
-            "67": 1789536768.0,
-            "68": 1789536768.0,
-            "69": 1789536768.0,
-            "70": 1789536768.0,
-            "71": 1789536768.0,
-            "72": 1789536768.0,
-            "73": 1789536768.0,
-            "74": 1789536768.0,
-            "75": 1789536768.0,
-            "76": 1789536768.0,
-            "77": 1789536768.0,
-            "78": 1789536768.0,
-            "79": 1789536768.0,
-            "80": 1789536768.0,
-            "81": 1789536768.0,
-            "82": 1789536768.0,
-            "83": 1789536768.0,
-            "84": 1789536768.0,
-            "85": 1789536768.0,
-            "86": 1789536768.0,
-            "87": 1789536768.0,
-            "88": 1789536768.0,
-            "89": 1789536768.0,
-            "90": 1789536768.0,
-            "91": 1789536768.0,
-            "92": 1789536768.0,
-            "93": 1789536768.0,
-            "94": 1789536768.0,
-            "95": 1789536768.0,
-            "96": 1789536768.0,
-            "97": 1789536768.0,
-            "98": 1789536768.0,
-            "99": 1789536768.0,
-            "100": 1789536768.0
+            "51": 1790585344.0,
+            "52": 1790585344.0,
+            "53": 1790585344.0,
+            "54": 1790585344.0,
+            "55": 1790585344.0,
+            "56": 1790585344.0,
+            "57": 1790585344.0,
+            "58": 1790585344.0,
+            "59": 1790585344.0,
+            "60": 1790585344.0,
+            "61": 1790585344.0,
+            "62": 1790585344.0,
+            "63": 1790585344.0,
+            "64": 1790585344.0,
+            "65": 1790585344.0,
+            "66": 1790585344.0,
+            "67": 1790585344.0,
+            "68": 1790585344.0,
+            "69": 1790585344.0,
+            "70": 1790585344.0,
+            "71": 1790585344.0,
+            "72": 1790585344.0,
+            "73": 1790585344.0,
+            "74": 1790585344.0,
+            "75": 1790585344.0,
+            "76": 1790585344.0,
+            "77": 1790585344.0,
+            "78": 1790585344.0,
+            "79": 1790585344.0,
+            "80": 1790585344.0,
+            "81": 1790585344.0,
+            "82": 1790585344.0,
+            "83": 1790585344.0,
+            "84": 1790585344.0,
+            "85": 1790585344.0,
+            "86": 1790585344.0,
+            "87": 1790585344.0,
+            "88": 1790585344.0,
+            "89": 1790585344.0,
+            "90": 1790585344.0,
+            "91": 1790585344.0,
+            "92": 1790585344.0,
+            "93": 1790585344.0,
+            "94": 1790585344.0,
+            "95": 1790585344.0,
+            "96": 1790585344.0,
+            "97": 1790585344.0,
+            "98": 1790585344.0,
+            "99": 1790585344.0,
+            "100": 1790585344.0
         }
     },
     "iteration-time": {
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json
index 3c3511a921b..7373ab0f485 100644
--- a/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json
+++ b/tests/functional_tests/test_cases/gpt/gpt3_mcore_te_tp4_pp2_resume_torch_dist_reshard_8x1xNone/golden_values_dev_dgx_gb200_2nd.json
@@ -54,56 +54,56 @@
             "48": "nan",
             "49": "nan",
             "50": "nan",
-            "51": 9.87398,
-            "52": 9.76584,
-            "53": 10.08272,
-            "54": 9.97273,
-            "55": 9.90736,
-            "56": 9.64213,
-            "57": 9.48856,
-            "58": 9.84268,
-            "59": 9.60111,
-            "60": 9.52013,
-            "61": 9.70058,
-            "62": 9.99642,
-            "63": 9.39067,
-            "64": 9.77612,
-            "65": 8.96637,
-            "66": 9.70949,
-            "67": 9.38771,
-            "68": 9.78893,
-            "69": 9.808,
-            "70": 9.74238,
-            "71": 9.63382,
-            "72": 9.59116,
-            "73": 9.50694,
-            "74": 8.94251,
-            "75": 9.42898,
-            "76": 9.0883,
-            "77": 10.07153,
-            "78": 9.72682,
-            "79": 9.38722,
-            "80": 9.40571,
-            "81": 9.48701,
-            "82": 9.70484,
-            "83": 9.31557,
-            "84": 9.42111,
-            "85": 9.61463,
-            "86": 9.08465,
-            "87": 9.59904,
-            "88": 9.75367,
-            "89": 9.606,
-            "90": 9.83155,
-            "91": 9.3388,
-            "92": 9.36037,
-            "93": 9.09036,
-            "94": 8.83711,
-            "95": 9.53804,
-            "96": 9.53392,
-            "97": 9.3132,
-            "98": 9.67422,
-            "99": 8.90347,
-            "100": 9.415
+            "51": 10.20964,
+            "52": 10.16012,
+            "53": 10.16893,
+            "54": 10.13361,
+            "55": 10.102,
+            "56": 10.12861,
+            "57": 10.11452,
+            "58": 10.13273,
+            "59": 10.07955,
+            "60": 10.09559,
+            "61": 10.04554,
+            "62": 10.01873,
+            "63": 10.08757,
+            "64": 10.04247,
+            "65": 10.02478,
+            "66": 10.04463,
+            "67": 10.02055,
+            "68": 9.98332,
+            "69": 10.00776,
+            "70": 9.98606,
+            "71": 10.01226,
+            "72": 9.99599,
+            "73": 9.98797,
+            "74": 9.97055,
+            "75": 9.94879,
+            "76": 9.9789,
+            "77": 9.96756,
+            "78": 9.92657,
+            "79": 9.93447,
+            "80": 9.94843,
+            "81": 9.97028,
+            "82": 9.90233,
+            "83": 9.87175,
+            "84": 9.80507,
+            "85": 9.80349,
+            "86": 9.89571,
+            "87": 9.9128,
+            "88": 9.89511,
+            "89": 9.82998,
+            "90": 9.82269,
+            "91": 9.83966,
+            "92": 9.82838,
+            "93": 9.76216,
+            "94": 9.83801,
+            "95": 9.83339,
+            "96": 9.81431,
+            "97": 9.75671,
+            "98": 9.78566,
+            "99": 9.82922,
+            "100": 9.72458
         }
     },
     "num-zeros": {
@@ -161,56 +161,56 @@
             "48": "nan",
             "49": "nan",
             "50": "nan",
-            "51": 2423.0,
-            "52": 2500.0,
-            "53": 2764.0,
-            "54": 2630.0,
-            "55": 2192.0,
-            "56": 2558.0,
-            "57": 2261.0,
-            "58": 2856.0,
-            "59": 2691.0,
-            "60": 2271.0,
-            "61": 2730.0,
-            "62": 2517.0,
-            "63": 2411.0,
-            "64": 2842.0,
-            "65": 2476.0,
-            "66": 2911.0,
-            "67": 2596.0,
-            "68": 2658.0,
-            "69": 2791.0,
-            "70": 3035.0,
-            "71": 2882.0,
-            "72": 2367.0,
-            "73": 2798.0,
-            "74": 1871.0,
-            "75": 2433.0,
-            "76": 2936.0,
-            "77": 3145.0,
-            "78": 2937.0,
-            "79": 2949.0,
-            "80": 3208.0,
-            "81": 3626.0,
-            "82": 3215.0,
-            "83": 2746.0,
-            "84": 3128.0,
-            "85": 3291.0,
-            "86": 2686.0,
-            "87": 3535.0,
-            "88": 2983.0,
-            "89": 3431.0,
-            "90": 3105.0,
-            "91": 2840.0,
-            "92": 3101.0,
-            "93": 2561.0,
-            "94": 3334.0,
-            "95": 3249.0,
-            "96": 3468.0,
-            "97": 3077.0,
-            "98": 3515.0,
-            "99": 3067.0,
-            "100": 3131.0
+            "51": 2522.0,
+            "52": 2703.0,
+            "53": 2612.0,
+            "54": 2880.0,
+            "55": 2661.0,
+            "56": 2782.0,
+            "57": 2227.0,
+            "58": 3609.0,
+            "59": 2862.0,
+            "60": 2797.0,
+            "61": 2691.0,
+            "62": 3136.0,
+            "63": 3410.0,
+            "64": 3558.0,
+            "65": 2716.0,
+            "66": 3062.0,
+            "67": 3839.0,
+            "68": 3284.0,
+            "69": 2999.0,
+            "70": 3293.0,
+            "71": 3183.0,
+            "72": 2908.0,
+            "73": 3433.0,
+            "74": 3357.0,
+            "75": 3159.0,
+            "76": 3201.0,
+            "77": 3783.0,
+            "78": 3296.0,
+            "79": 3401.0,
+            "80": 3032.0,
+            "81": 3581.0,
+            "82": 2798.0,
+            "83": 3004.0,
+            "84": 3009.0,
+            "85": 2754.0,
+            "86": 2891.0,
+            "87": 2871.0,
+            "88": 2946.0,
+            "89": 3221.0,
+            "90": 3698.0,
+            "91": 2657.0,
+            "92": 3140.0,
+            "93": 3022.0,
+            "94": 2829.0,
+            "95": 2993.0,
+            "96": 3243.0,
+            "97": 3391.0,
+            "98": 3526.0,
+            "99": 3247.0,
+            "100": 3220.0
         }
     },
     "mem-allocated-bytes": {
@@ -483,55 +483,55 @@
             "49": "nan",
             "50": "nan",
             "51": "nan",
-            "52": 3.36268,
-            "53": 0.28919,
-            "54": 0.2725,
-            "55": 0.27972,
-            "56": 0.2728,
-            "57": 0.27382,
-            "58": 0.27288,
-            "59": 0.27294,
-            "60": 0.27575,
-            "61": 0.27075,
-            "62": 0.27057,
-            "63": 0.27211,
-            "64": 0.26991,
-            "65": 0.27298,
-            "66": 0.27045,
-            "67": 0.27231,
-            "68": 0.27315,
-            "69": 0.26969,
-            "70": 0.27037,
-            "71": 0.27028,
-            "72": 0.27191,
-            "73": 0.2714,
-            "74": 0.27082,
-            "75": 0.2722,
-            "76": 0.27153,
-            "77": 0.27331,
-            "78": 0.27142,
-            "79": 0.27368,
-            "80": 0.27144,
-            "81": 0.26895,
-            "82": 0.27139,
-            "83": 0.26946,
-            "84": 0.27033,
-            "85": 0.2702,
-            "86": 0.26955,
-            "87": 0.2686,
-            "88": 0.27213,
-            "89": 0.2709,
-            "90": 0.27061,
-            "91": 0.27274,
-            "92": 0.26989,
-            "93": 0.27031,
-            "94": 0.27054,
-            "95": 0.269,
-            "96": 0.27187,
-            "97": 0.26915,
-            "98": 0.2696,
-            "99": 0.27075,
-            "100": 0.26802
+            "52": 5.54937,
+            "53": 0.31325,
+            "54": 0.29919,
+            "55": 0.29944,
+            "56": 0.29944,
+            "57": 0.30082,
+            "58": 0.30142,
+            "59": 0.3033,
+            "60": 0.30533,
+            "61": 0.30384,
+            "62": 0.30581,
+            "63": 0.30526,
+            "64": 0.30422,
+            "65": 0.30442,
+            "66": 0.30448,
+            "67": 0.30597,
+            "68": 0.30487,
+            "69": 0.30391,
+            "70": 0.3035,
+            "71": 0.30146,
+            "72": 0.30236,
+            "73": 0.30279,
+            "74": 0.30239,
+            "75": 0.30159,
+            "76": 0.30364,
+            "77": 0.30316,
+            "78": 0.30312,
+            "79": 0.30225,
+            "80": 0.30346,
+            "81": 0.30158,
+            "82": 0.30366,
+            "83": 0.30317,
+            "84": 0.30296,
+            "85": 0.30361,
+            "86": 0.3046,
+            "87": 0.30629,
+            "88": 0.30678,
+            "89": 0.30424,
+            "90": 0.30473,
+            "91": 0.30574,
+            "92": 0.30405,
+            "93": 0.30457,
+            "94": 0.30516,
+            "95": 0.30388,
+            "96": 0.30425,
+            "97": 0.30434,
+            "98": 0.30479,
+            "99": 0.30491,
+            "100": 0.30342
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..f56b5fa6f77
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/golden_values_dev_dgx_h100.json
@@ -0,0 +1,10037 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 10.85954,
+            "2": 10.88017,
+            "3": 10.87732,
+            "4": 10.8999,
+            "5": 10.88699,
+            "6": 10.87335,
+            "7": 10.88219,
+            "8": 10.87225,
+            "9": 10.87277,
+            "10": 10.87494,
+            "11": 10.85221,
+            "12": 10.84405,
+            "13": 10.84222,
+            "14": 10.86461,
+            "15": 10.78656,
+            "16": 10.81059,
+            "17": 10.77436,
+            "18": 10.81246,
+            "19": 10.72203,
+            "20": 10.69596,
+            "21": 10.64272,
+            "22": 10.64956,
+            "23": 10.65288,
+            "24": 10.54233,
+            "25": 10.55491,
+            "26": 10.63818,
+            "27": 10.44117,
+            "28": 10.46928,
+            "29": 10.34986,
+            "30": 10.24645,
+            "31": 10.42625,
+            "32": 10.33791,
+            "33": 10.19559,
+            "34": 10.14074,
+            "35": 10.22182,
+            "36": 10.13202,
+            "37": 10.07533,
+            "38": 10.01538,
+            "39": 10.02986,
+            "40": 10.05768,
+            "41": 9.93219,
+            "42": 9.93962,
+            "43": 9.8498,
+            "44": 9.97902,
+            "45": 9.99946,
+            "46": 9.83276,
+            "47": 9.99696,
+            "48": 9.80958,
+            "49": 9.94884,
+            "50": 9.94537,
+            "51": 9.58197,
+            "52": 9.79331,
+            "53": 9.62548,
+            "54": 9.88686,
+            "55": 9.73482,
+            "56": 9.84492,
+            "57": 9.85708,
+            "58": 9.87627,
+            "59": 9.54205,
+            "60": 9.64489,
+            "61": 9.88334,
+            "62": 9.75928,
+            "63": 9.68107,
+            "64": 9.82461,
+            "65": 9.59476,
+            "66": 9.62868,
+            "67": 9.74002,
+            "68": 9.60205,
+            "69": 9.29216,
+            "70": 9.42139,
+            "71": 9.78753,
+            "72": 9.7124,
+            "73": 9.61815,
+            "74": 9.44773,
+            "75": 9.23898,
+            "76": 9.50824,
+            "77": 9.5795,
+            "78": 9.56058,
+            "79": 9.30801,
+            "80": 9.35768,
+            "81": 9.45813,
+            "82": 9.55358,
+            "83": 9.53407,
+            "84": 9.35442,
+            "85": 9.3992,
+            "86": 9.65282,
+            "87": 9.23449,
+            "88": 9.48753,
+            "89": 9.22214,
+            "90": 9.41067,
+            "91": 9.38753,
+            "92": 9.37682,
+            "93": 9.36024,
+            "94": 9.51507,
+            "95": 9.42125,
+            "96": 9.33616,
+            "97": 9.20399,
+            "98": 9.4954,
+            "99": 9.29284,
+            "100": 9.35905,
+            "101": 9.24757,
+            "102": 9.24676,
+            "103": 9.07735,
+            "104": 9.16669,
+            "105": 9.37858,
+            "106": 9.1496,
+            "107": 9.1756,
+            "108": 9.316,
+            "109": 9.29109,
+            "110": 9.36426,
+            "111": 9.17995,
+            "112": 9.23471,
+            "113": 9.35297,
+            "114": 9.35265,
+            "115": 9.32672,
+            "116": 9.00223,
+            "117": 9.06476,
+            "118": 9.06643,
+            "119": 9.22418,
+            "120": 9.08485,
+            "121": 9.19671,
+            "122": 9.14164,
+            "123": 9.25933,
+            "124": 9.45506,
+            "125": 9.21512,
+            "126": 9.06416,
+            "127": 9.01814,
+            "128": 9.22131,
+            "129": 8.98184,
+            "130": 9.13972,
+            "131": 9.15856,
+            "132": 9.03559,
+            "133": 8.85977,
+            "134": 9.18539,
+            "135": 8.88999,
+            "136": 9.16801,
+            "137": 9.15771,
+            "138": 9.23511,
+            "139": 9.09197,
+            "140": 8.87218,
+            "141": 9.29906,
+            "142": 9.19961,
+            "143": 9.1169,
+            "144": 9.24305,
+            "145": 9.10446,
+            "146": 8.98709,
+            "147": 8.98617,
+            "148": 9.13261,
+            "149": 9.06335,
+            "150": 9.01504,
+            "151": 8.92787,
+            "152": 8.8739,
+            "153": 9.06335,
+            "154": 9.17913,
+            "155": 9.13381,
+            "156": 9.04889,
+            "157": 9.15064,
+            "158": 9.04955,
+            "159": 9.03261,
+            "160": 8.88987,
+            "161": 9.04543,
+            "162": 8.89584,
+            "163": 8.84272,
+            "164": 8.97534,
+            "165": 8.93132,
+            "166": 8.65959,
+            "167": 8.83243,
+            "168": 8.81953,
+            "169": 8.6566,
+            "170": 9.04622,
+            "171": 8.72286,
+            "172": 8.82159,
+            "173": 8.91163,
+            "174": 8.84751,
+            "175": 8.70611,
+            "176": 8.75439,
+            "177": 8.7626,
+            "178": 8.7201,
+            "179": 8.64046,
+            "180": 8.74053,
+            "181": 8.69404,
+            "182": 8.72193,
+            "183": 9.08364,
+            "184": 8.6088,
+            "185": 8.88346,
+            "186": 8.74191,
+            "187": 8.56949,
+            "188": 8.67975,
+            "189": 8.86478,
+            "190": 8.53542,
+            "191": 8.66632,
+            "192": 8.61266,
+            "193": 8.57469,
+            "194": 8.75195,
+            "195": 8.59279,
+            "196": 8.77393,
+            "197": 8.74234,
+            "198": 8.62722,
+            "199": 8.77454,
+            "200": 8.73803,
+            "201": 8.66979,
+            "202": 8.54593,
+            "203": 8.54185,
+            "204": 8.71307,
+            "205": 8.2228,
+            "206": 8.8603,
+            "207": 8.68157,
+            "208": 8.70896,
+            "209": 8.75303,
+            "210": 8.57807,
+            "211": 8.84258,
+            "212": 8.49127,
+            "213": 8.57327,
+            "214": 8.51199,
+            "215": 8.5645,
+            "216": 8.50863,
+            "217": 8.53183,
+            "218": 8.52998,
+            "219": 8.64367,
+            "220": 8.54746,
+            "221": 8.39991,
+            "222": 8.50528,
+            "223": 8.43775,
+            "224": 8.53014,
+            "225": 8.57091,
+            "226": 8.4394,
+            "227": 8.67918,
+            "228": 8.38473,
+            "229": 8.45045,
+            "230": 8.49717,
+            "231": 8.49832,
+            "232": 8.49783,
+            "233": 8.49539,
+            "234": 8.63795,
+            "235": 8.55875,
+            "236": 8.39461,
+            "237": 8.48826,
+            "238": 8.30522,
+            "239": 8.562,
+            "240": 8.66952,
+            "241": 8.44144,
+            "242": 8.47219,
+            "243": 8.51768,
+            "244": 8.36825,
+            "245": 8.59274,
+            "246": 8.59497,
+            "247": 8.44008,
+            "248": 8.51279,
+            "249": 8.52035,
+            "250": 8.42183,
+            "251": 8.37751,
+            "252": 8.54393,
+            "253": 8.31454,
+            "254": 8.351,
+            "255": 8.29005,
+            "256": 8.20261,
+            "257": 8.394,
+            "258": 8.45386,
+            "259": 8.23708,
+            "260": 8.2437,
+            "261": 8.23617,
+            "262": 8.34919,
+            "263": 8.30683,
+            "264": 8.18831,
+            "265": 8.33481,
+            "266": 8.23369,
+            "267": 7.89923,
+            "268": 8.38063,
+            "269": 8.40466,
+            "270": 8.26271,
+            "271": 8.279,
+            "272": 8.32109,
+            "273": 8.13747,
+            "274": 8.09677,
+            "275": 8.01372,
+            "276": 7.92611,
+            "277": 8.24041,
+            "278": 8.05017,
+            "279": 7.96688,
+            "280": 7.75652,
+            "281": 8.10713,
+            "282": 8.15049,
+            "283": 8.15621,
+            "284": 8.10354,
+            "285": 8.07234,
+            "286": 7.90454,
+            "287": 7.9963,
+            "288": 8.24862,
+            "289": 8.17575,
+            "290": 8.13093,
+            "291": 8.25763,
+            "292": 8.08131,
+            "293": 8.12059,
+            "294": 7.98178,
+            "295": 7.97108,
+            "296": 8.24114,
+            "297": 7.79647,
+            "298": 8.04847,
+            "299": 7.94257,
+            "300": 7.85748,
+            "301": 8.01649,
+            "302": 7.95112,
+            "303": 7.99606,
+            "304": 7.96394,
+            "305": 8.00301,
+            "306": 7.98312,
+            "307": 7.99372,
+            "308": 8.00491,
+            "309": 8.01362,
+            "310": 7.97824,
+            "311": 7.9323,
+            "312": 7.89419,
+            "313": 7.84054,
+            "314": 7.83,
+            "315": 7.8335,
+            "316": 7.75122,
+            "317": 7.934,
+            "318": 7.98841,
+            "319": 7.83343,
+            "320": 7.57896,
+            "321": 7.75427,
+            "322": 7.83781,
+            "323": 7.7769,
+            "324": 7.91623,
+            "325": 7.80539,
+            "326": 7.65641,
+            "327": 7.86989,
+            "328": 7.79369,
+            "329": 7.89137,
+            "330": 7.7586,
+            "331": 7.52885,
+            "332": 7.81946,
+            "333": 7.84359,
+            "334": 7.68375,
+            "335": 7.69975,
+            "336": 7.91931,
+            "337": 7.65356,
+            "338": 7.90277,
+            "339": 7.7307,
+            "340": 7.7606,
+            "341": 7.70898,
+            "342": 7.82827,
+            "343": 7.61824,
+            "344": 7.58818,
+            "345": 7.61602,
+            "346": 7.46415,
+            "347": 7.5612,
+            "348": 7.68737,
+            "349": 7.58361,
+            "350": 7.65762,
+            "351": 7.75424,
+            "352": 7.711,
+            "353": 7.50477,
+            "354": 7.74925,
+            "355": 7.77011,
+            "356": 7.78305,
+            "357": 7.81855,
+            "358": 7.60031,
+            "359": 7.55187,
+            "360": 7.63213,
+            "361": 7.55298,
+            "362": 7.76875,
+            "363": 7.59465,
+            "364": 7.57928,
+            "365": 7.62839,
+            "366": 7.31096,
+            "367": 7.55919,
+            "368": 7.44577,
+            "369": 7.3551,
+            "370": 7.46985,
+            "371": 7.46609,
+            "372": 7.65475,
+            "373": 7.52989,
+            "374": 7.44843,
+            "375": 7.53627,
+            "376": 7.35288,
+            "377": 7.24313,
+            "378": 7.54312,
+            "379": 7.4994,
+            "380": 7.38859,
+            "381": 7.47577,
+            "382": 7.29951,
+            "383": 7.28478,
+            "384": 7.4126,
+            "385": 7.39829,
+            "386": 7.23652,
+            "387": 7.42535,
+            "388": 7.28487,
+            "389": 7.44425,
+            "390": 7.24578,
+            "391": 7.6482,
+            "392": 7.34245,
+            "393": 7.42463,
+            "394": 7.48248,
+            "395": 7.44483,
+            "396": 7.29231,
+            "397": 7.23386,
+            "398": 7.42507,
+            "399": 7.16173,
+            "400": 7.30149,
+            "401": 7.3585,
+            "402": 7.39832,
+            "403": 7.28806,
+            "404": 7.30832,
+            "405": 7.27202,
+            "406": 7.22485,
+            "407": 7.36688,
+            "408": 7.18877,
+            "409": 7.17334,
+            "410": 7.31999,
+            "411": 7.2223,
+            "412": 7.20595,
+            "413": 7.24047,
+            "414": 6.9176,
+            "415": 7.3341,
+            "416": 7.43139,
+            "417": 7.0298,
+            "418": 7.28201,
+            "419": 7.04286,
+            "420": 7.41864,
+            "421": 7.18456,
+            "422": 7.24003,
+            "423": 7.09785,
+            "424": 7.24581,
+            "425": 7.32182,
+            "426": 7.29342,
+            "427": 7.1359,
+            "428": 7.09617,
+            "429": 6.87976,
+            "430": 7.20691,
+            "431": 7.00662,
+            "432": 7.23762,
+            "433": 6.97996,
+            "434": 6.96131,
+            "435": 7.02219,
+            "436": 7.01484,
+            "437": 6.9921,
+            "438": 7.00514,
+            "439": 6.94235,
+            "440": 7.06367,
+            "441": 7.04936,
+            "442": 7.10187,
+            "443": 7.0941,
+            "444": 6.71175,
+            "445": 6.99825,
+            "446": 7.14631,
+            "447": 7.12745,
+            "448": 6.98621,
+            "449": 7.0508,
+            "450": 7.01761,
+            "451": 6.83255,
+            "452": 6.9157,
+            "453": 7.02056,
+            "454": 6.97019,
+            "455": 7.03145,
+            "456": 6.99451,
+            "457": 6.97283,
+            "458": 6.9066,
+            "459": 6.69482,
+            "460": 7.06773,
+            "461": 7.09857,
+            "462": 6.87116,
+            "463": 7.05522,
+            "464": 6.64922,
+            "465": 7.02852,
+            "466": 7.00594,
+            "467": 6.99935,
+            "468": 6.95215,
+            "469": 6.8291,
+            "470": 7.04615,
+            "471": 6.88316,
+            "472": 6.96104,
+            "473": 6.82398,
+            "474": 6.97228,
+            "475": 7.16917,
+            "476": 6.76379,
+            "477": 6.89771,
+            "478": 6.91142,
+            "479": 6.70396,
+            "480": 7.03025,
+            "481": 6.99763,
+            "482": 6.73608,
+            "483": 6.78502,
+            "484": 6.75413,
+            "485": 6.93205,
+            "486": 7.06796,
+            "487": 6.63653,
+            "488": 6.88737,
+            "489": 6.77108,
+            "490": 6.82685,
+            "491": 6.71122,
+            "492": 6.69849,
+            "493": 6.77155,
+            "494": 6.67651,
+            "495": 6.63733,
+            "496": 6.59006,
+            "497": 6.84564,
+            "498": 6.65256,
+            "499": 6.85952,
+            "500": 6.65795,
+            "501": 6.73562,
+            "502": 6.84527,
+            "503": 6.71173,
+            "504": 6.62075,
+            "505": 6.62291,
+            "506": 6.75234,
+            "507": 6.86844,
+            "508": 6.86157,
+            "509": 6.6555,
+            "510": 6.82834,
+            "511": 6.74132,
+            "512": 6.74051,
+            "513": 6.66032,
+            "514": 6.71273,
+            "515": 6.45045,
+            "516": 6.74436,
+            "517": 6.71073,
+            "518": 6.53817,
+            "519": 6.63527,
+            "520": 6.85868,
+            "521": 6.66571,
+            "522": 6.70871,
+            "523": 6.74553,
+            "524": 6.73396,
+            "525": 6.6762,
+            "526": 6.4139,
+            "527": 6.79901,
+            "528": 6.66011,
+            "529": 6.63182,
+            "530": 6.62611,
+            "531": 6.64289,
+            "532": 6.63292,
+            "533": 6.76391,
+            "534": 6.61301,
+            "535": 6.74754,
+            "536": 6.62605,
+            "537": 6.63867,
+            "538": 6.53166,
+            "539": 6.5542,
+            "540": 6.5862,
+            "541": 6.45207,
+            "542": 6.66957,
+            "543": 6.68064,
+            "544": 6.67601,
+            "545": 6.81307,
+            "546": 6.63333,
+            "547": 6.41838,
+            "548": 6.72367,
+            "549": 6.69982,
+            "550": 6.52974,
+            "551": 6.7478,
+            "552": 6.63991,
+            "553": 6.48451,
+            "554": 6.63407,
+            "555": 6.4629,
+            "556": 6.61792,
+            "557": 6.63496,
+            "558": 6.3874,
+            "559": 6.37379,
+            "560": 6.58293,
+            "561": 6.73352,
+            "562": 6.6356,
+            "563": 6.7444,
+            "564": 6.35291,
+            "565": 6.51482,
+            "566": 6.70247,
+            "567": 6.56973,
+            "568": 6.51145,
+            "569": 6.45578,
+            "570": 6.36768,
+            "571": 6.63597,
+            "572": 6.31359,
+            "573": 6.58668,
+            "574": 6.47613,
+            "575": 6.64961,
+            "576": 6.5168,
+            "577": 6.53078,
+            "578": 6.4847,
+            "579": 6.46709,
+            "580": 6.56793,
+            "581": 6.60857,
+            "582": 6.48362,
+            "583": 6.51541,
+            "584": 6.52831,
+            "585": 6.42713,
+            "586": 6.4178,
+            "587": 6.46113,
+            "588": 6.56878,
+            "589": 6.62653,
+            "590": 6.29114,
+            "591": 6.67541,
+            "592": 6.26902,
+            "593": 6.4773,
+            "594": 6.38719,
+            "595": 6.3632,
+            "596": 6.26099,
+            "597": 6.18986,
+            "598": 6.45726,
+            "599": 6.3998,
+            "600": 6.45709,
+            "601": 6.26132,
+            "602": 6.5338,
+            "603": 6.52288,
+            "604": 6.38993,
+            "605": 6.49993,
+            "606": 6.31475,
+            "607": 6.53507,
+            "608": 6.67525,
+            "609": 6.17714,
+            "610": 6.57295,
+            "611": 6.40188,
+            "612": 6.57929,
+            "613": 6.42667,
+            "614": 6.20672,
+            "615": 6.40081,
+            "616": 6.36019,
+            "617": 6.37969,
+            "618": 6.4512,
+            "619": 6.14244,
+            "620": 6.41233,
+            "621": 6.46338,
+            "622": 6.40096,
+            "623": 6.58352,
+            "624": 6.36078,
+            "625": 6.28553,
+            "626": 6.30525,
+            "627": 6.44574,
+            "628": 6.2557,
+            "629": 6.58813,
+            "630": 6.36641,
+            "631": 6.3498,
+            "632": 6.30972,
+            "633": 6.25733,
+            "634": 6.30887,
+            "635": 6.54592,
+            "636": 6.24834,
+            "637": 6.63634,
+            "638": 6.02046,
+            "639": 6.2798,
+            "640": 6.29548,
+            "641": 6.20953,
+            "642": 6.28471,
+            "643": 6.461,
+            "644": 6.25863,
+            "645": 6.25115,
+            "646": 6.40601,
+            "647": 6.33707,
+            "648": 6.35671,
+            "649": 6.3488,
+            "650": 6.48415,
+            "651": 6.33395,
+            "652": 6.25233,
+            "653": 6.3826,
+            "654": 6.45063,
+            "655": 6.52494,
+            "656": 6.32781,
+            "657": 6.43503,
+            "658": 6.24353,
+            "659": 6.1554,
+            "660": 6.39397,
+            "661": 6.17184,
+            "662": 6.27494,
+            "663": 6.37237,
+            "664": 6.33376,
+            "665": 6.40442,
+            "666": 6.16399,
+            "667": 6.1965,
+            "668": 6.2366,
+            "669": 6.21813,
+            "670": 6.24601,
+            "671": 6.24468,
+            "672": 6.49032,
+            "673": 6.34071,
+            "674": 6.2969,
+            "675": 6.38396,
+            "676": 6.39021,
+            "677": 6.30588,
+            "678": 6.27751,
+            "679": 6.23892,
+            "680": 6.2942,
+            "681": 6.20621,
+            "682": 6.08719,
+            "683": 6.27464,
+            "684": 6.32896,
+            "685": 6.30248,
+            "686": 6.15397,
+            "687": 6.2862,
+            "688": 6.20754,
+            "689": 6.6215,
+            "690": 6.17931,
+            "691": 6.18188,
+            "692": 6.2745,
+            "693": 6.14405,
+            "694": 6.23487,
+            "695": 6.32617,
+            "696": 6.11842,
+            "697": 6.15483,
+            "698": 6.23128,
+            "699": 6.46051,
+            "700": 6.0454,
+            "701": 6.06467,
+            "702": 6.25219,
+            "703": 6.18603,
+            "704": 6.21704,
+            "705": 6.13155,
+            "706": 6.07593,
+            "707": 6.25376,
+            "708": 6.31553,
+            "709": 6.01087,
+            "710": 6.16305,
+            "711": 6.26062,
+            "712": 6.18307,
+            "713": 5.89806,
+            "714": 6.10759,
+            "715": 6.11617,
+            "716": 6.41405,
+            "717": 6.19202,
+            "718": 6.2345,
+            "719": 6.27471,
+            "720": 6.26372,
+            "721": 6.26277,
+            "722": 6.23442,
+            "723": 6.0814,
+            "724": 6.22797,
+            "725": 6.04057,
+            "726": 6.30046,
+            "727": 6.01682,
+            "728": 6.04617,
+            "729": 6.09111,
+            "730": 6.18359,
+            "731": 6.10398,
+            "732": 6.08898,
+            "733": 6.12312,
+            "734": 6.38423,
+            "735": 6.27849,
+            "736": 6.18184,
+            "737": 6.36645,
+            "738": 6.13411,
+            "739": 6.14591,
+            "740": 5.87975,
+            "741": 6.00667,
+            "742": 5.98459,
+            "743": 6.17495,
+            "744": 6.02962,
+            "745": 6.15497,
+            "746": 6.03272,
+            "747": 6.09789,
+            "748": 6.23436,
+            "749": 5.94191,
+            "750": 6.16819,
+            "751": 5.9596,
+            "752": 6.01941,
+            "753": 6.02989,
+            "754": 6.28798,
+            "755": 6.13521,
+            "756": 6.25357,
+            "757": 6.02098,
+            "758": 6.20422,
+            "759": 6.23062,
+            "760": 6.02316,
+            "761": 6.19655,
+            "762": 6.22713,
+            "763": 6.03754,
+            "764": 5.9636,
+            "765": 5.93413,
+            "766": 5.97155,
+            "767": 5.81277,
+            "768": 6.18725,
+            "769": 6.27646,
+            "770": 6.29561,
+            "771": 5.78767,
+            "772": 6.03281,
+            "773": 6.18558,
+            "774": 5.88583,
+            "775": 6.03167,
+            "776": 6.13086,
+            "777": 5.88612,
+            "778": 6.05891,
+            "779": 5.87414,
+            "780": 6.14047,
+            "781": 5.85641,
+            "782": 6.04961,
+            "783": 5.95687,
+            "784": 5.91852,
+            "785": 6.09816,
+            "786": 6.10929,
+            "787": 5.66006,
+            "788": 5.99915,
+            "789": 6.21789,
+            "790": 6.26737,
+            "791": 5.79122,
+            "792": 5.99828,
+            "793": 6.18387,
+            "794": 6.02746,
+            "795": 6.0051,
+            "796": 6.17065,
+            "797": 6.05376,
+            "798": 6.06076,
+            "799": 6.11682,
+            "800": 6.02167,
+            "801": 6.15011,
+            "802": 5.98473,
+            "803": 6.15363,
+            "804": 6.00859,
+            "805": 5.83055,
+            "806": 6.08757,
+            "807": 6.04997,
+            "808": 5.92717,
+            "809": 5.77802,
+            "810": 6.01973,
+            "811": 5.93299,
+            "812": 5.91169,
+            "813": 5.96567,
+            "814": 6.0369,
+            "815": 5.8146,
+            "816": 6.12034,
+            "817": 5.94337,
+            "818": 6.0674,
+            "819": 6.01476,
+            "820": 5.7319,
+            "821": 5.95027,
+            "822": 6.20452,
+            "823": 5.83139,
+            "824": 5.98275,
+            "825": 6.18795,
+            "826": 6.20019,
+            "827": 6.05802,
+            "828": 6.06976,
+            "829": 5.89149,
+            "830": 5.94221,
+            "831": 5.89773,
+            "832": 5.97341,
+            "833": 6.06501,
+            "834": 5.99675,
+            "835": 6.00654,
+            "836": 5.79277,
+            "837": 6.11496,
+            "838": 5.86966,
+            "839": 5.83554,
+            "840": 6.18614,
+            "841": 5.78491,
+            "842": 5.89169,
+            "843": 5.95102,
+            "844": 6.00954,
+            "845": 6.09153,
+            "846": 5.68733,
+            "847": 5.75715,
+            "848": 5.96838,
+            "849": 6.09512,
+            "850": 5.84886,
+            "851": 6.01693,
+            "852": 5.75188,
+            "853": 5.99355,
+            "854": 6.01844,
+            "855": 5.81656,
+            "856": 5.99593,
+            "857": 6.00207,
+            "858": 6.05507,
+            "859": 5.95295,
+            "860": 6.09632,
+            "861": 6.07189,
+            "862": 6.00434,
+            "863": 5.83757,
+            "864": 5.84474,
+            "865": 5.93791,
+            "866": 5.89404,
+            "867": 5.87803,
+            "868": 6.06515,
+            "869": 6.08564,
+            "870": 5.97153,
+            "871": 6.04317,
+            "872": 5.89525,
+            "873": 5.84383,
+            "874": 6.02742,
+            "875": 5.9144,
+            "876": 5.96905,
+            "877": 5.92979,
+            "878": 6.09819,
+            "879": 5.76783,
+            "880": 6.01501,
+            "881": 5.99647,
+            "882": 5.9097,
+            "883": 5.67626,
+            "884": 5.96521,
+            "885": 5.74544,
+            "886": 5.99268,
+            "887": 5.90979,
+            "888": 5.83897,
+            "889": 6.01033,
+            "890": 6.02378,
+            "891": 5.95247,
+            "892": 5.70829,
+            "893": 6.0922,
+            "894": 5.73134,
+            "895": 5.84057,
+            "896": 5.84075,
+            "897": 5.8564,
+            "898": 5.9238,
+            "899": 5.93486,
+            "900": 5.89946,
+            "901": 5.95293,
+            "902": 5.83295,
+            "903": 6.05665,
+            "904": 5.93153,
+            "905": 5.90441,
+            "906": 5.6172,
+            "907": 5.91178,
+            "908": 5.73853,
+            "909": 5.99118,
+            "910": 5.86603,
+            "911": 5.70397,
+            "912": 5.70712,
+            "913": 5.76497,
+            "914": 5.83944,
+            "915": 5.80032,
+            "916": 5.8904,
+            "917": 5.86913,
+            "918": 5.82415,
+            "919": 5.81575,
+            "920": 5.89552,
+            "921": 5.84163,
+            "922": 5.62427,
+            "923": 6.03657,
+            "924": 5.60536,
+            "925": 5.62335,
+            "926": 5.86148,
+            "927": 5.96071,
+            "928": 5.84005,
+            "929": 5.82702,
+            "930": 5.95816,
+            "931": 5.765,
+            "932": 5.59211,
+            "933": 5.6351,
+            "934": 5.80541,
+            "935": 5.63715,
+            "936": 5.83772,
+            "937": 5.96629,
+            "938": 5.59109,
+            "939": 5.7899,
+            "940": 5.96726,
+            "941": 5.7264,
+            "942": 5.83547,
+            "943": 5.86622,
+            "944": 5.95478,
+            "945": 5.70263,
+            "946": 5.55832,
+            "947": 5.74831,
+            "948": 5.79312,
+            "949": 5.8268,
+            "950": 5.84353,
+            "951": 5.72242,
+            "952": 5.69295,
+            "953": 5.67852,
+            "954": 5.72473,
+            "955": 5.53107,
+            "956": 5.62074,
+            "957": 5.84076,
+            "958": 5.79676,
+            "959": 5.57317,
+            "960": 5.80125,
+            "961": 5.82952,
+            "962": 5.76695,
+            "963": 5.76461,
+            "964": 5.70677,
+            "965": 5.64012,
+            "966": 5.59617,
+            "967": 5.72434,
+            "968": 5.74036,
+            "969": 5.82392,
+            "970": 5.64422,
+            "971": 5.7065,
+            "972": 5.85308,
+            "973": 5.66884,
+            "974": 5.71841,
+            "975": 5.86273,
+            "976": 5.70493,
+            "977": 5.77104,
+            "978": 5.6858,
+            "979": 5.58655,
+            "980": 5.75924,
+            "981": 5.8969,
+            "982": 5.47038,
+            "983": 5.61817,
+            "984": 5.54504,
+            "985": 5.59032,
+            "986": 5.64132,
+            "987": 5.56966,
+            "988": 5.70939,
+            "989": 5.69379,
+            "990": 5.62195,
+            "991": 5.84899,
+            "992": 5.77877,
+            "993": 5.87022,
+            "994": 5.69735,
+            "995": 5.73242,
+            "996": 5.73704,
+            "997": 5.81329,
+            "998": 5.83634,
+            "999": 5.83399,
+            "1000": 5.68342,
+            "1001": 5.86668,
+            "1002": 5.76052,
+            "1003": 5.64259,
+            "1004": 5.79811,
+            "1005": 5.53617,
+            "1006": 5.326,
+            "1007": 5.76701,
+            "1008": 5.79136,
+            "1009": 5.65046,
+            "1010": 5.77942,
+            "1011": 5.89493,
+            "1012": 5.62303,
+            "1013": 5.61569,
+            "1014": 5.68111,
+            "1015": 5.55747,
+            "1016": 5.87327,
+            "1017": 5.83312,
+            "1018": 5.61865,
+            "1019": 5.73414,
+            "1020": 5.61755,
+            "1021": 5.848,
+            "1022": 5.50045,
+            "1023": 5.65182,
+            "1024": 5.74493,
+            "1025": 5.5692,
+            "1026": 5.41415,
+            "1027": 5.60696,
+            "1028": 5.6928,
+            "1029": 5.68764,
+            "1030": 5.68746,
+            "1031": 5.40696,
+            "1032": 5.78748,
+            "1033": 5.58136,
+            "1034": 5.61937,
+            "1035": 5.71368,
+            "1036": 5.62818,
+            "1037": 5.3679,
+            "1038": 5.66452,
+            "1039": 5.64347,
+            "1040": 5.57004,
+            "1041": 5.59722,
+            "1042": 5.81329,
+            "1043": 5.566,
+            "1044": 5.46906,
+            "1045": 5.9659,
+            "1046": 5.4866,
+            "1047": 5.38954,
+            "1048": 5.50027,
+            "1049": 5.67182,
+            "1050": 5.6991,
+            "1051": 5.57928,
+            "1052": 5.68227,
+            "1053": 5.62737,
+            "1054": 5.45766,
+            "1055": 5.60313,
+            "1056": 5.67386,
+            "1057": 5.75895,
+            "1058": 5.56782,
+            "1059": 5.74888,
+            "1060": 5.82022,
+            "1061": 5.47624,
+            "1062": 5.64897,
+            "1063": 5.50121,
+            "1064": 5.59136,
+            "1065": 5.55347,
+            "1066": 5.74367,
+            "1067": 5.67235,
+            "1068": 5.44068,
+            "1069": 5.60636,
+            "1070": 5.81264,
+            "1071": 5.51129,
+            "1072": 5.61871,
+            "1073": 5.62147,
+            "1074": 5.524,
+            "1075": 5.70529,
+            "1076": 5.5934,
+            "1077": 5.71153,
+            "1078": 5.56524,
+            "1079": 5.61728,
+            "1080": 5.64251,
+            "1081": 5.62319,
+            "1082": 5.49648,
+            "1083": 5.64086,
+            "1084": 5.55389,
+            "1085": 5.40631,
+            "1086": 5.62008,
+            "1087": 5.44148,
+            "1088": 5.51218,
+            "1089": 5.7676,
+            "1090": 5.53165,
+            "1091": 5.51388,
+            "1092": 5.41011,
+            "1093": 5.70025,
+            "1094": 5.57364,
+            "1095": 5.57735,
+            "1096": 5.61585,
+            "1097": 5.64586,
+            "1098": 5.64877,
+            "1099": 5.51631,
+            "1100": 5.63778,
+            "1101": 5.67335,
+            "1102": 5.54037,
+            "1103": 5.54969,
+            "1104": 5.53882,
+            "1105": 5.54754,
+            "1106": 5.68315,
+            "1107": 5.68556,
+            "1108": 5.78611,
+            "1109": 5.53666,
+            "1110": 5.66598,
+            "1111": 5.58973,
+            "1112": 5.58039,
+            "1113": 5.62611,
+            "1114": 5.61279,
+            "1115": 5.59718,
+            "1116": 5.65925,
+            "1117": 5.64676,
+            "1118": 5.65036,
+            "1119": 5.70919,
+            "1120": 5.62738,
+            "1121": 5.37352,
+            "1122": 5.22976,
+            "1123": 5.47237,
+            "1124": 5.64939,
+            "1125": 5.67974,
+            "1126": 5.679,
+            "1127": 5.56811,
+            "1128": 5.61992,
+            "1129": 5.29637,
+            "1130": 5.54359,
+            "1131": 5.63153,
+            "1132": 5.72427,
+            "1133": 5.51914,
+            "1134": 5.56063,
+            "1135": 5.52056,
+            "1136": 5.42646,
+            "1137": 5.45971,
+            "1138": 5.56927,
+            "1139": 5.41452,
+            "1140": 5.2656,
+            "1141": 5.58265,
+            "1142": 5.64152,
+            "1143": 5.38298,
+            "1144": 5.38584,
+            "1145": 5.36231,
+            "1146": 5.63508,
+            "1147": 5.49183,
+            "1148": 5.50524,
+            "1149": 5.52352,
+            "1150": 5.39801,
+            "1151": 5.5563,
+            "1152": 5.41525,
+            "1153": 5.44791,
+            "1154": 5.49757,
+            "1155": 5.43833,
+            "1156": 5.3488,
+            "1157": 5.66444,
+            "1158": 5.39487,
+            "1159": 5.33455,
+            "1160": 5.79503,
+            "1161": 5.53955,
+            "1162": 5.45818,
+            "1163": 5.52563,
+            "1164": 5.3837,
+            "1165": 5.52861,
+            "1166": 5.48753,
+            "1167": 5.36312,
+            "1168": 5.49491,
+            "1169": 5.39842,
+            "1170": 5.59202,
+            "1171": 5.48502,
+            "1172": 5.64238,
+            "1173": 5.62295,
+            "1174": 5.50843,
+            "1175": 5.34639,
+            "1176": 5.38504,
+            "1177": 5.55461,
+            "1178": 5.46852,
+            "1179": 5.49505,
+            "1180": 5.46014,
+            "1181": 5.56031,
+            "1182": 5.59593,
+            "1183": 5.77155,
+            "1184": 5.54926,
+            "1185": 5.29008,
+            "1186": 5.60451,
+            "1187": 5.55363,
+            "1188": 5.51655,
+            "1189": 5.39133,
+            "1190": 5.40482,
+            "1191": 5.39266,
+            "1192": 5.50142,
+            "1193": 5.46347,
+            "1194": 5.45607,
+            "1195": 5.32751,
+            "1196": 5.52219,
+            "1197": 5.4809,
+            "1198": 5.52789,
+            "1199": 5.3874,
+            "1200": 5.33059,
+            "1201": 5.48969,
+            "1202": 5.43584,
+            "1203": 5.49537,
+            "1204": 5.40861,
+            "1205": 5.48971,
+            "1206": 5.3371,
+            "1207": 5.58625,
+            "1208": 5.4312,
+            "1209": 5.29323,
+            "1210": 5.50765,
+            "1211": 5.51506,
+            "1212": 5.59777,
+            "1213": 5.42123,
+            "1214": 5.51018,
+            "1215": 5.23832,
+            "1216": 5.40989,
+            "1217": 5.38537,
+            "1218": 5.45232,
+            "1219": 5.48221,
+            "1220": 5.38594,
+            "1221": 5.44848,
+            "1222": 5.31032,
+            "1223": 5.47835,
+            "1224": 5.42017,
+            "1225": 5.43499,
+            "1226": 5.3238,
+            "1227": 5.47632,
+            "1228": 5.72418,
+            "1229": 5.32629,
+            "1230": 5.40556,
+            "1231": 5.06972,
+            "1232": 5.78794,
+            "1233": 5.28923,
+            "1234": 5.24535,
+            "1235": 5.37092,
+            "1236": 5.48471,
+            "1237": 5.20864,
+            "1238": 5.41643,
+            "1239": 5.40751,
+            "1240": 5.46767,
+            "1241": 5.57266,
+            "1242": 5.4536,
+            "1243": 5.43063,
+            "1244": 5.51812,
+            "1245": 5.19115,
+            "1246": 5.72042,
+            "1247": 5.43187,
+            "1248": 5.30004,
+            "1249": 5.40113,
+            "1250": 5.33798,
+            "1251": 5.42034,
+            "1252": 5.57217,
+            "1253": 5.48773,
+            "1254": 5.30628,
+            "1255": 5.51443,
+            "1256": 5.60755,
+            "1257": 5.4214,
+            "1258": 5.56457,
+            "1259": 5.48027,
+            "1260": 5.51461,
+            "1261": 5.63883,
+            "1262": 5.39531,
+            "1263": 5.32916,
+            "1264": 5.50671,
+            "1265": 5.30632,
+            "1266": 5.23819,
+            "1267": 5.37206,
+            "1268": 5.39267,
+            "1269": 5.15366,
+            "1270": 5.40418,
+            "1271": 5.27732,
+            "1272": 5.5252,
+            "1273": 5.30228,
+            "1274": 5.3516,
+            "1275": 5.38466,
+            "1276": 5.39786,
+            "1277": 5.46218,
+            "1278": 5.34689,
+            "1279": 5.44274,
+            "1280": 5.45919,
+            "1281": 5.40638,
+            "1282": 5.3824,
+            "1283": 5.42204,
+            "1284": 5.34841,
+            "1285": 5.50133,
+            "1286": 5.33557,
+            "1287": 5.58795,
+            "1288": 5.26493,
+            "1289": 5.429,
+            "1290": 5.50282,
+            "1291": 5.50335,
+            "1292": 5.44662,
+            "1293": 5.41955,
+            "1294": 5.49953,
+            "1295": 5.34675,
+            "1296": 5.19062,
+            "1297": 5.17238,
+            "1298": 5.11916,
+            "1299": 5.30339,
+            "1300": 5.21032,
+            "1301": 5.30157,
+            "1302": 5.27472,
+            "1303": 5.36107,
+            "1304": 5.43231,
+            "1305": 5.36999,
+            "1306": 5.25347,
+            "1307": 5.18829,
+            "1308": 5.27033,
+            "1309": 5.40736,
+            "1310": 5.26399,
+            "1311": 5.38109,
+            "1312": 5.35438,
+            "1313": 5.30056,
+            "1314": 5.2953,
+            "1315": 5.42245,
+            "1316": 5.26148,
+            "1317": 5.28065,
+            "1318": 5.2198,
+            "1319": 5.34619,
+            "1320": 5.42093,
+            "1321": 5.44976,
+            "1322": 5.46399,
+            "1323": 5.37327,
+            "1324": 5.25463,
+            "1325": 5.40657,
+            "1326": 5.54082,
+            "1327": 5.39378,
+            "1328": 5.21893,
+            "1329": 5.41851,
+            "1330": 5.40079,
+            "1331": 5.31685,
+            "1332": 5.31253,
+            "1333": 5.37243,
+            "1334": 5.44685,
+            "1335": 5.37136,
+            "1336": 5.43779,
+            "1337": 5.47852,
+            "1338": 5.30292,
+            "1339": 5.14181,
+            "1340": 5.41486,
+            "1341": 5.3443,
+            "1342": 5.36197,
+            "1343": 5.47816,
+            "1344": 5.37832,
+            "1345": 5.34294,
+            "1346": 5.08195,
+            "1347": 5.38558,
+            "1348": 5.4918,
+            "1349": 5.40832,
+            "1350": 5.02622,
+            "1351": 5.3151,
+            "1352": 5.1591,
+            "1353": 5.34674,
+            "1354": 5.35963,
+            "1355": 5.11092,
+            "1356": 5.2587,
+            "1357": 5.29209,
+            "1358": 5.15773,
+            "1359": 5.11035,
+            "1360": 5.17288,
+            "1361": 5.30521,
+            "1362": 5.06318,
+            "1363": 5.2947,
+            "1364": 5.40031,
+            "1365": 5.02241,
+            "1366": 5.11779,
+            "1367": 5.33051,
+            "1368": 5.18648,
+            "1369": 5.22984,
+            "1370": 5.19906,
+            "1371": 5.2839,
+            "1372": 5.26155,
+            "1373": 5.28402,
+            "1374": 5.28112,
+            "1375": 5.46052,
+            "1376": 5.2713,
+            "1377": 5.26467,
+            "1378": 5.31344,
+            "1379": 5.22741,
+            "1380": 5.26107,
+            "1381": 5.47871,
+            "1382": 5.08923,
+            "1383": 5.375,
+            "1384": 5.35914,
+            "1385": 5.38983,
+            "1386": 5.16417,
+            "1387": 5.16094,
+            "1388": 5.28017,
+            "1389": 5.30376,
+            "1390": 5.25514,
+            "1391": 5.26911,
+            "1392": 5.37008,
+            "1393": 5.38307,
+            "1394": 5.40394,
+            "1395": 5.32492,
+            "1396": 5.21356,
+            "1397": 5.28,
+            "1398": 5.37051,
+            "1399": 5.35873,
+            "1400": 5.26512,
+            "1401": 5.35924,
+            "1402": 5.42148,
+            "1403": 5.20238,
+            "1404": 5.28629,
+            "1405": 5.11984,
+            "1406": 4.99128,
+            "1407": 5.40442,
+            "1408": 5.19825,
+            "1409": 5.3964,
+            "1410": 5.37519,
+            "1411": 4.91758,
+            "1412": 5.35561,
+            "1413": 5.41314,
+            "1414": 5.21823,
+            "1415": 5.44159,
+            "1416": 5.32905,
+            "1417": 5.38859,
+            "1418": 5.29946,
+            "1419": 5.31787,
+            "1420": 5.43974,
+            "1421": 5.39414,
+            "1422": 5.41749,
+            "1423": 5.005,
+            "1424": 5.32995,
+            "1425": 5.58618,
+            "1426": 5.23059,
+            "1427": 5.31804,
+            "1428": 5.33277,
+            "1429": 5.07552,
+            "1430": 5.33075,
+            "1431": 5.32688,
+            "1432": 5.33826,
+            "1433": 5.19107,
+            "1434": 5.16341,
+            "1435": 5.19905,
+            "1436": 5.10851,
+            "1437": 5.229,
+            "1438": 5.31867,
+            "1439": 5.34731,
+            "1440": 5.34991,
+            "1441": 5.16484,
+            "1442": 5.22015,
+            "1443": 5.20933,
+            "1444": 5.13701,
+            "1445": 5.07414,
+            "1446": 5.26836,
+            "1447": 5.25895,
+            "1448": 5.2904,
+            "1449": 5.2498,
+            "1450": 5.34281,
+            "1451": 5.07084,
+            "1452": 5.27052,
+            "1453": 5.1668,
+            "1454": 5.01539,
+            "1455": 5.12292,
+            "1456": 5.2717,
+            "1457": 5.18713,
+            "1458": 5.00608,
+            "1459": 5.22304,
+            "1460": 5.23389,
+            "1461": 5.07142,
+            "1462": 4.96923,
+            "1463": 5.14383,
+            "1464": 5.21128,
+            "1465": 5.26911,
+            "1466": 5.34961,
+            "1467": 5.33438,
+            "1468": 5.22205,
+            "1469": 5.04373,
+            "1470": 5.11715,
+            "1471": 5.25199,
+            "1472": 5.12294,
+            "1473": 5.10395,
+            "1474": 5.21775,
+            "1475": 5.18567,
+            "1476": 5.15287,
+            "1477": 5.26203,
+            "1478": 5.30399,
+            "1479": 5.01175,
+            "1480": 5.1809,
+            "1481": 5.24516,
+            "1482": 5.34866,
+            "1483": 5.26395,
+            "1484": 4.92397,
+            "1485": 5.29179,
+            "1486": 5.04178,
+            "1487": 4.88296,
+            "1488": 5.18145,
+            "1489": 5.10246,
+            "1490": 5.04399,
+            "1491": 5.31709,
+            "1492": 5.22469,
+            "1493": 4.94051,
+            "1494": 5.10929,
+            "1495": 5.13424,
+            "1496": 5.05862,
+            "1497": 5.36633,
+            "1498": 5.30967,
+            "1499": 5.13834,
+            "1500": 5.09851,
+            "1501": 5.03466,
+            "1502": 5.15527,
+            "1503": 5.43143,
+            "1504": 5.31968,
+            "1505": 5.00114,
+            "1506": 5.14444,
+            "1507": 5.16068,
+            "1508": 5.16575,
+            "1509": 5.31451,
+            "1510": 5.0185,
+            "1511": 5.11697,
+            "1512": 4.98287,
+            "1513": 5.16993,
+            "1514": 5.33962,
+            "1515": 5.36563,
+            "1516": 5.27715,
+            "1517": 5.22687,
+            "1518": 5.02626,
+            "1519": 5.29861,
+            "1520": 5.1417,
+            "1521": 5.15866,
+            "1522": 5.32824,
+            "1523": 5.24625,
+            "1524": 5.06725,
+            "1525": 5.20424,
+            "1526": 5.27994,
+            "1527": 5.25677,
+            "1528": 5.23589,
+            "1529": 5.18688,
+            "1530": 5.24365,
+            "1531": 5.09964,
+            "1532": 5.15141,
+            "1533": 5.05087,
+            "1534": 5.21589,
+            "1535": 5.1635,
+            "1536": 5.09678,
+            "1537": 5.02713,
+            "1538": 4.91184,
+            "1539": 5.23801,
+            "1540": 5.11515,
+            "1541": 5.25246,
+            "1542": 5.23484,
+            "1543": 5.05152,
+            "1544": 5.07544,
+            "1545": 5.1161,
+            "1546": 5.33085,
+            "1547": 5.11115,
+            "1548": 5.23527,
+            "1549": 5.23735,
+            "1550": 4.97596,
+            "1551": 5.2566,
+            "1552": 5.02944,
+            "1553": 5.14849,
+            "1554": 5.11205,
+            "1555": 5.10901,
+            "1556": 5.19824,
+            "1557": 5.08883,
+            "1558": 5.23067,
+            "1559": 5.00402,
+            "1560": 5.11835,
+            "1561": 5.14529,
+            "1562": 5.17996,
+            "1563": 5.24454,
+            "1564": 5.26389,
+            "1565": 5.08902,
+            "1566": 5.29474,
+            "1567": 5.04166,
+            "1568": 5.09256,
+            "1569": 5.20014,
+            "1570": 5.17348,
+            "1571": 4.95353,
+            "1572": 5.04005,
+            "1573": 5.02897,
+            "1574": 4.99751,
+            "1575": 5.2314,
+            "1576": 5.21263,
+            "1577": 5.12799,
+            "1578": 5.36241,
+            "1579": 4.94367,
+            "1580": 5.12197,
+            "1581": 5.09638,
+            "1582": 5.28497,
+            "1583": 5.04918,
+            "1584": 5.05482,
+            "1585": 5.11977,
+            "1586": 5.30243,
+            "1587": 5.13447,
+            "1588": 5.2184,
+            "1589": 4.83833,
+            "1590": 5.09497,
+            "1591": 5.17411,
+            "1592": 5.13721,
+            "1593": 5.23457,
+            "1594": 5.11805,
+            "1595": 5.10775,
+            "1596": 5.18964,
+            "1597": 5.11486,
+            "1598": 5.15917,
+            "1599": 5.19102,
+            "1600": 4.86871,
+            "1601": 5.11732,
+            "1602": 5.23185,
+            "1603": 5.19543,
+            "1604": 5.05128,
+            "1605": 5.02692,
+            "1606": 4.98659,
+            "1607": 5.07391,
+            "1608": 4.97985,
+            "1609": 5.07337,
+            "1610": 5.04745,
+            "1611": 4.99848,
+            "1612": 4.75205,
+            "1613": 5.03316,
+            "1614": 4.88034,
+            "1615": 5.07442,
+            "1616": 5.23082,
+            "1617": 5.06132,
+            "1618": 4.98704,
+            "1619": 5.18333,
+            "1620": 5.14491,
+            "1621": 5.31452,
+            "1622": 5.05677,
+            "1623": 5.14346,
+            "1624": 5.1355,
+            "1625": 5.12006,
+            "1626": 5.10245,
+            "1627": 5.10987,
+            "1628": 5.06581,
+            "1629": 4.92971,
+            "1630": 5.06799,
+            "1631": 5.06088,
+            "1632": 5.10428,
+            "1633": 4.97515,
+            "1634": 4.9235,
+            "1635": 5.05833,
+            "1636": 4.92289,
+            "1637": 5.24051,
+            "1638": 5.15574,
+            "1639": 4.977,
+            "1640": 5.00918,
+            "1641": 5.12718,
+            "1642": 5.08305,
+            "1643": 5.04894,
+            "1644": 5.1181,
+            "1645": 4.96677,
+            "1646": 5.11931,
+            "1647": 5.03295,
+            "1648": 5.19969,
+            "1649": 4.92396,
+            "1650": 5.05963,
+            "1651": 4.92965,
+            "1652": 5.21121,
+            "1653": 5.15959,
+            "1654": 5.12828,
+            "1655": 5.16263,
+            "1656": 5.34595,
+            "1657": 5.20677,
+            "1658": 5.04112,
+            "1659": 4.9258,
+            "1660": 4.80954,
+            "1661": 5.03086,
+            "1662": 5.14123,
+            "1663": 5.15449,
+            "1664": 4.981,
+            "1665": 5.11714,
+            "1666": 5.10575,
+            "1667": 4.84897,
+            "1668": 5.11513,
+            "1669": 5.06995,
+            "1670": 5.11266,
+            "1671": 5.17201,
+            "1672": 4.77569,
+            "1673": 5.03851,
+            "1674": 4.91569,
+            "1675": 5.05176,
+            "1676": 5.00402,
+            "1677": 4.79944,
+            "1678": 5.02487,
+            "1679": 4.89421,
+            "1680": 5.03847,
+            "1681": 5.06815,
+            "1682": 5.03274,
+            "1683": 4.90688,
+            "1684": 5.06515,
+            "1685": 5.13579,
+            "1686": 5.0732,
+            "1687": 4.97656,
+            "1688": 5.16537,
+            "1689": 5.14707,
+            "1690": 4.99688,
+            "1691": 5.00011,
+            "1692": 4.91822,
+            "1693": 5.01472,
+            "1694": 4.94657,
+            "1695": 4.91341,
+            "1696": 5.08209,
+            "1697": 5.04294,
+            "1698": 4.9511,
+            "1699": 5.00187,
+            "1700": 4.95393,
+            "1701": 5.16563,
+            "1702": 5.07666,
+            "1703": 5.17125,
+            "1704": 5.14332,
+            "1705": 4.96247,
+            "1706": 4.98333,
+            "1707": 4.79005,
+            "1708": 5.03831,
+            "1709": 5.23334,
+            "1710": 5.02934,
+            "1711": 5.19037,
+            "1712": 5.1958,
+            "1713": 5.03582,
+            "1714": 5.04603,
+            "1715": 4.91495,
+            "1716": 4.9332,
+            "1717": 4.86109,
+            "1718": 5.0273,
+            "1719": 5.12334,
+            "1720": 5.02189,
+            "1721": 4.92752,
+            "1722": 5.05412,
+            "1723": 4.93537,
+            "1724": 5.0407,
+            "1725": 5.1914,
+            "1726": 5.06447,
+            "1727": 4.90742,
+            "1728": 5.02116,
+            "1729": 5.04574,
+            "1730": 4.90343,
+            "1731": 4.99945,
+            "1732": 4.92083,
+            "1733": 5.1311,
+            "1734": 4.82837,
+            "1735": 5.20905,
+            "1736": 4.91585,
+            "1737": 4.85859,
+            "1738": 4.97909,
+            "1739": 5.16688,
+            "1740": 4.83514,
+            "1741": 4.77896,
+            "1742": 4.90909,
+            "1743": 5.08523,
+            "1744": 4.9784,
+            "1745": 4.82327,
+            "1746": 4.94833,
+            "1747": 4.87022,
+            "1748": 5.06379,
+            "1749": 4.8705,
+            "1750": 5.01347,
+            "1751": 5.12189,
+            "1752": 4.90364,
+            "1753": 5.09398,
+            "1754": 5.05918,
+            "1755": 4.89649,
+            "1756": 5.02243,
+            "1757": 5.14389,
+            "1758": 4.8716,
+            "1759": 4.94237,
+            "1760": 4.83366,
+            "1761": 5.02233,
+            "1762": 4.81292,
+            "1763": 4.77382,
+            "1764": 4.93787,
+            "1765": 5.14977,
+            "1766": 5.33847,
+            "1767": 5.22339,
+            "1768": 4.95072,
+            "1769": 5.00607,
+            "1770": 4.98077,
+            "1771": 4.96436,
+            "1772": 4.98395,
+            "1773": 4.97312,
+            "1774": 4.86859,
+            "1775": 4.95207,
+            "1776": 4.99761,
+            "1777": 4.94332,
+            "1778": 4.99268,
+            "1779": 5.08376,
+            "1780": 4.83276,
+            "1781": 5.05321,
+            "1782": 4.9968,
+            "1783": 5.01268,
+            "1784": 4.93195,
+            "1785": 5.16736,
+            "1786": 4.81265,
+            "1787": 4.97081,
+            "1788": 4.82725,
+            "1789": 4.88846,
+            "1790": 4.79821,
+            "1791": 4.73741,
+            "1792": 4.87626,
+            "1793": 5.10356,
+            "1794": 4.98084,
+            "1795": 4.96551,
+            "1796": 4.99704,
+            "1797": 4.7903,
+            "1798": 4.76702,
+            "1799": 5.01884,
+            "1800": 4.91364,
+            "1801": 5.04679,
+            "1802": 4.82665,
+            "1803": 4.95171,
+            "1804": 4.88594,
+            "1805": 4.90346,
+            "1806": 4.87351,
+            "1807": 4.92406,
+            "1808": 4.92697,
+            "1809": 5.1451,
+            "1810": 5.09976,
+            "1811": 4.95906,
+            "1812": 4.80139,
+            "1813": 5.09748,
+            "1814": 4.77766,
+            "1815": 4.86134,
+            "1816": 5.05005,
+            "1817": 4.79012,
+            "1818": 4.80376,
+            "1819": 5.02382,
+            "1820": 4.68652,
+            "1821": 5.02661,
+            "1822": 4.66251,
+            "1823": 4.8659,
+            "1824": 4.78635,
+            "1825": 5.06537,
+            "1826": 4.81944,
+            "1827": 4.7895,
+            "1828": 4.94677,
+            "1829": 5.11262,
+            "1830": 4.91236,
+            "1831": 4.89818,
+            "1832": 4.83359,
+            "1833": 4.78363,
+            "1834": 4.9482,
+            "1835": 4.95795,
+            "1836": 4.90747,
+            "1837": 4.67243,
+            "1838": 4.80953,
+            "1839": 4.89546,
+            "1840": 4.90488,
+            "1841": 4.8292,
+            "1842": 4.94678,
+            "1843": 4.70293,
+            "1844": 4.61431,
+            "1845": 5.00086,
+            "1846": 4.74657,
+            "1847": 4.8645,
+            "1848": 4.89695,
+            "1849": 4.85358,
+            "1850": 4.8676,
+            "1851": 5.02236,
+            "1852": 4.97647,
+            "1853": 4.83325,
+            "1854": 4.86791,
+            "1855": 4.8219,
+            "1856": 4.75614,
+            "1857": 4.9619,
+            "1858": 4.96856,
+            "1859": 4.75323,
+            "1860": 4.86592,
+            "1861": 5.20685,
+            "1862": 4.61669,
+            "1863": 4.83385,
+            "1864": 4.7505,
+            "1865": 4.86441,
+            "1866": 4.79455,
+            "1867": 4.99688,
+            "1868": 4.71331,
+            "1869": 4.75634,
+            "1870": 4.93203,
+            "1871": 4.99184,
+            "1872": 4.68332,
+            "1873": 4.69823,
+            "1874": 4.85174,
+            "1875": 4.85999,
+            "1876": 4.7392,
+            "1877": 4.80362,
+            "1878": 4.81239,
+            "1879": 4.82084,
+            "1880": 4.89314,
+            "1881": 4.79389,
+            "1882": 4.79419,
+            "1883": 4.78157,
+            "1884": 4.97086,
+            "1885": 4.91799,
+            "1886": 4.82203,
+            "1887": 4.81334,
+            "1888": 4.97395,
+            "1889": 4.95922,
+            "1890": 4.70676,
+            "1891": 4.65282,
+            "1892": 4.84393,
+            "1893": 4.64594,
+            "1894": 4.90265,
+            "1895": 4.7886,
+            "1896": 4.66112,
+            "1897": 4.78966,
+            "1898": 4.9139,
+            "1899": 4.77532,
+            "1900": 4.91571,
+            "1901": 4.84525,
+            "1902": 4.78411,
+            "1903": 4.75997,
+            "1904": 4.65339,
+            "1905": 4.54188,
+            "1906": 4.81097,
+            "1907": 4.90225,
+            "1908": 5.03012,
+            "1909": 4.88434,
+            "1910": 4.78852,
+            "1911": 4.80477,
+            "1912": 4.64685,
+            "1913": 4.94065,
+            "1914": 4.87965,
+            "1915": 4.85906,
+            "1916": 4.92227,
+            "1917": 4.85425,
+            "1918": 4.87001,
+            "1919": 4.99304,
+            "1920": 4.76319,
+            "1921": 4.88494,
+            "1922": 4.81295,
+            "1923": 4.7592,
+            "1924": 4.82501,
+            "1925": 5.05793,
+            "1926": 4.92996,
+            "1927": 4.92587,
+            "1928": 4.92702,
+            "1929": 4.92705,
+            "1930": 4.91019,
+            "1931": 4.77616,
+            "1932": 4.85963,
+            "1933": 4.83545,
+            "1934": 4.84013,
+            "1935": 5.10729,
+            "1936": 4.88314,
+            "1937": 4.87654,
+            "1938": 4.79463,
+            "1939": 4.71148,
+            "1940": 4.82418,
+            "1941": 4.73372,
+            "1942": 4.87249,
+            "1943": 4.7353,
+            "1944": 4.74198,
+            "1945": 4.6818,
+            "1946": 4.91539,
+            "1947": 4.86756,
+            "1948": 4.59887,
+            "1949": 4.90387,
+            "1950": 4.78785,
+            "1951": 4.95942,
+            "1952": 4.73677,
+            "1953": 4.79496,
+            "1954": 4.73264,
+            "1955": 4.84308,
+            "1956": 4.88233,
+            "1957": 4.73496,
+            "1958": 4.70018,
+            "1959": 4.75966,
+            "1960": 4.76849,
+            "1961": 4.7146,
+            "1962": 4.83392,
+            "1963": 4.82321,
+            "1964": 4.84664,
+            "1965": 4.87523,
+            "1966": 4.78753,
+            "1967": 4.59211,
+            "1968": 4.82724,
+            "1969": 4.59184,
+            "1970": 4.56633,
+            "1971": 4.9072,
+            "1972": 4.90064,
+            "1973": 4.54642,
+            "1974": 4.82423,
+            "1975": 4.82778,
+            "1976": 4.71327,
+            "1977": 4.57967,
+            "1978": 5.0045,
+            "1979": 4.66094,
+            "1980": 4.74256,
+            "1981": 4.86301,
+            "1982": 4.72234,
+            "1983": 4.8786,
+            "1984": 4.64152,
+            "1985": 4.78,
+            "1986": 4.70167,
+            "1987": 4.81036,
+            "1988": 4.8871,
+            "1989": 4.63185,
+            "1990": 4.79636,
+            "1991": 4.69424,
+            "1992": 4.79439,
+            "1993": 4.74063,
+            "1994": 4.84977,
+            "1995": 4.5596,
+            "1996": 4.65161,
+            "1997": 4.80342,
+            "1998": 4.67403,
+            "1999": 4.72284,
+            "2000": 4.61765
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 80.0,
+            "2": 70.0,
+            "3": 78.0,
+            "4": 80.0,
+            "5": 75.0,
+            "6": 87.0,
+            "7": 63.0,
+            "8": 77.0,
+            "9": 62.0,
+            "10": 90.0,
+            "11": 74.0,
+            "12": 79.0,
+            "13": 77.0,
+            "14": 83.0,
+            "15": 78.0,
+            "16": 69.0,
+            "17": 64.0,
+            "18": 63.0,
+            "19": 87.0,
+            "20": 90.0,
+            "21": 75.0,
+            "22": 84.0,
+            "23": 81.0,
+            "24": 78.0,
+            "25": 87.0,
+            "26": 69.0,
+            "27": 86.0,
+            "28": 91.0,
+            "29": 94.0,
+            "30": 115.0,
+            "31": 99.0,
+            "32": 109.0,
+            "33": 92.0,
+            "34": 103.0,
+            "35": 118.0,
+            "36": 117.0,
+            "37": 105.0,
+            "38": 129.0,
+            "39": 89.0,
+            "40": 129.0,
+            "41": 114.0,
+            "42": 121.0,
+            "43": 135.0,
+            "44": 128.0,
+            "45": 126.0,
+            "46": 129.0,
+            "47": 133.0,
+            "48": 139.0,
+            "49": 135.0,
+            "50": 157.0,
+            "51": 122.0,
+            "52": 150.0,
+            "53": 108.0,
+            "54": 140.0,
+            "55": 133.0,
+            "56": 156.0,
+            "57": 150.0,
+            "58": 153.0,
+            "59": 135.0,
+            "60": 135.0,
+            "61": 165.0,
+            "62": 145.0,
+            "63": 199.0,
+            "64": 161.0,
+            "65": 162.0,
+            "66": 162.0,
+            "67": 195.0,
+            "68": 140.0,
+            "69": 158.0,
+            "70": 169.0,
+            "71": 188.0,
+            "72": 160.0,
+            "73": 151.0,
+            "74": 154.0,
+            "75": 172.0,
+            "76": 169.0,
+            "77": 165.0,
+            "78": 193.0,
+            "79": 144.0,
+            "80": 173.0,
+            "81": 150.0,
+            "82": 141.0,
+            "83": 186.0,
+            "84": 169.0,
+            "85": 183.0,
+            "86": 196.0,
+            "87": 197.0,
+            "88": 184.0,
+            "89": 169.0,
+            "90": 182.0,
+            "91": 200.0,
+            "92": 179.0,
+            "93": 165.0,
+            "94": 153.0,
+            "95": 176.0,
+            "96": 191.0,
+            "97": 183.0,
+            "98": 199.0,
+            "99": 163.0,
+            "100": 157.0,
+            "101": 144.0,
+            "102": 184.0,
+            "103": 206.0,
+            "104": 171.0,
+            "105": 215.0,
+            "106": 176.0,
+            "107": 172.0,
+            "108": 172.0,
+            "109": 172.0,
+            "110": 216.0,
+            "111": 182.0,
+            "112": 172.0,
+            "113": 167.0,
+            "114": 192.0,
+            "115": 175.0,
+            "116": 181.0,
+            "117": 177.0,
+            "118": 142.0,
+            "119": 212.0,
+            "120": 164.0,
+            "121": 193.0,
+            "122": 160.0,
+            "123": 169.0,
+            "124": 191.0,
+            "125": 214.0,
+            "126": 160.0,
+            "127": 192.0,
+            "128": 160.0,
+            "129": 180.0,
+            "130": 214.0,
+            "131": 219.0,
+            "132": 173.0,
+            "133": 166.0,
+            "134": 171.0,
+            "135": 182.0,
+            "136": 172.0,
+            "137": 176.0,
+            "138": 174.0,
+            "139": 161.0,
+            "140": 178.0,
+            "141": 164.0,
+            "142": 159.0,
+            "143": 192.0,
+            "144": 157.0,
+            "145": 144.0,
+            "146": 149.0,
+            "147": 148.0,
+            "148": 169.0,
+            "149": 143.0,
+            "150": 111.0,
+            "151": 159.0,
+            "152": 115.0,
+            "153": 147.0,
+            "154": 162.0,
+            "155": 185.0,
+            "156": 144.0,
+            "157": 147.0,
+            "158": 130.0,
+            "159": 165.0,
+            "160": 190.0,
+            "161": 141.0,
+            "162": 155.0,
+            "163": 140.0,
+            "164": 174.0,
+            "165": 168.0,
+            "166": 179.0,
+            "167": 147.0,
+            "168": 138.0,
+            "169": 161.0,
+            "170": 159.0,
+            "171": 125.0,
+            "172": 193.0,
+            "173": 172.0,
+            "174": 190.0,
+            "175": 192.0,
+            "176": 146.0,
+            "177": 168.0,
+            "178": 172.0,
+            "179": 177.0,
+            "180": 148.0,
+            "181": 161.0,
+            "182": 213.0,
+            "183": 215.0,
+            "184": 201.0,
+            "185": 154.0,
+            "186": 207.0,
+            "187": 175.0,
+            "188": 183.0,
+            "189": 169.0,
+            "190": 167.0,
+            "191": 163.0,
+            "192": 193.0,
+            "193": 169.0,
+            "194": 161.0,
+            "195": 141.0,
+            "196": 174.0,
+            "197": 188.0,
+            "198": 168.0,
+            "199": 150.0,
+            "200": 187.0,
+            "201": 173.0,
+            "202": 183.0,
+            "203": 142.0,
+            "204": 177.0,
+            "205": 153.0,
+            "206": 198.0,
+            "207": 168.0,
+            "208": 140.0,
+            "209": 179.0,
+            "210": 175.0,
+            "211": 167.0,
+            "212": 194.0,
+            "213": 192.0,
+            "214": 174.0,
+            "215": 188.0,
+            "216": 164.0,
+            "217": 170.0,
+            "218": 171.0,
+            "219": 211.0,
+            "220": 195.0,
+            "221": 181.0,
+            "222": 154.0,
+            "223": 176.0,
+            "224": 173.0,
+            "225": 166.0,
+            "226": 174.0,
+            "227": 211.0,
+            "228": 146.0,
+            "229": 193.0,
+            "230": 149.0,
+            "231": 177.0,
+            "232": 169.0,
+            "233": 193.0,
+            "234": 183.0,
+            "235": 215.0,
+            "236": 200.0,
+            "237": 218.0,
+            "238": 179.0,
+            "239": 139.0,
+            "240": 217.0,
+            "241": 174.0,
+            "242": 193.0,
+            "243": 192.0,
+            "244": 181.0,
+            "245": 206.0,
+            "246": 221.0,
+            "247": 219.0,
+            "248": 175.0,
+            "249": 189.0,
+            "250": 156.0,
+            "251": 205.0,
+            "252": 164.0,
+            "253": 172.0,
+            "254": 184.0,
+            "255": 218.0,
+            "256": 171.0,
+            "257": 208.0,
+            "258": 210.0,
+            "259": 174.0,
+            "260": 199.0,
+            "261": 178.0,
+            "262": 185.0,
+            "263": 181.0,
+            "264": 200.0,
+            "265": 171.0,
+            "266": 149.0,
+            "267": 141.0,
+            "268": 186.0,
+            "269": 198.0,
+            "270": 170.0,
+            "271": 168.0,
+            "272": 210.0,
+            "273": 151.0,
+            "274": 212.0,
+            "275": 182.0,
+            "276": 172.0,
+            "277": 159.0,
+            "278": 169.0,
+            "279": 185.0,
+            "280": 174.0,
+            "281": 160.0,
+            "282": 171.0,
+            "283": 174.0,
+            "284": 183.0,
+            "285": 169.0,
+            "286": 173.0,
+            "287": 203.0,
+            "288": 168.0,
+            "289": 202.0,
+            "290": 157.0,
+            "291": 241.0,
+            "292": 172.0,
+            "293": 209.0,
+            "294": 194.0,
+            "295": 207.0,
+            "296": 217.0,
+            "297": 160.0,
+            "298": 126.0,
+            "299": 170.0,
+            "300": 177.0,
+            "301": 189.0,
+            "302": 209.0,
+            "303": 170.0,
+            "304": 177.0,
+            "305": 148.0,
+            "306": 172.0,
+            "307": 213.0,
+            "308": 184.0,
+            "309": 193.0,
+            "310": 218.0,
+            "311": 159.0,
+            "312": 178.0,
+            "313": 177.0,
+            "314": 199.0,
+            "315": 165.0,
+            "316": 168.0,
+            "317": 185.0,
+            "318": 261.0,
+            "319": 181.0,
+            "320": 196.0,
+            "321": 200.0,
+            "322": 217.0,
+            "323": 198.0,
+            "324": 200.0,
+            "325": 184.0,
+            "326": 283.0,
+            "327": 211.0,
+            "328": 231.0,
+            "329": 189.0,
+            "330": 248.0,
+            "331": 205.0,
+            "332": 208.0,
+            "333": 199.0,
+            "334": 182.0,
+            "335": 202.0,
+            "336": 207.0,
+            "337": 216.0,
+            "338": 231.0,
+            "339": 213.0,
+            "340": 240.0,
+            "341": 207.0,
+            "342": 153.0,
+            "343": 264.0,
+            "344": 214.0,
+            "345": 202.0,
+            "346": 183.0,
+            "347": 194.0,
+            "348": 216.0,
+            "349": 206.0,
+            "350": 218.0,
+            "351": 218.0,
+            "352": 207.0,
+            "353": 225.0,
+            "354": 213.0,
+            "355": 201.0,
+            "356": 227.0,
+            "357": 217.0,
+            "358": 206.0,
+            "359": 186.0,
+            "360": 217.0,
+            "361": 187.0,
+            "362": 256.0,
+            "363": 226.0,
+            "364": 203.0,
+            "365": 200.0,
+            "366": 241.0,
+            "367": 205.0,
+            "368": 192.0,
+            "369": 160.0,
+            "370": 221.0,
+            "371": 212.0,
+            "372": 193.0,
+            "373": 218.0,
+            "374": 164.0,
+            "375": 249.0,
+            "376": 195.0,
+            "377": 197.0,
+            "378": 222.0,
+            "379": 254.0,
+            "380": 210.0,
+            "381": 199.0,
+            "382": 217.0,
+            "383": 208.0,
+            "384": 238.0,
+            "385": 183.0,
+            "386": 221.0,
+            "387": 185.0,
+            "388": 205.0,
+            "389": 185.0,
+            "390": 217.0,
+            "391": 241.0,
+            "392": 212.0,
+            "393": 247.0,
+            "394": 242.0,
+            "395": 247.0,
+            "396": 197.0,
+            "397": 202.0,
+            "398": 191.0,
+            "399": 231.0,
+            "400": 211.0,
+            "401": 200.0,
+            "402": 210.0,
+            "403": 261.0,
+            "404": 211.0,
+            "405": 171.0,
+            "406": 209.0,
+            "407": 200.0,
+            "408": 226.0,
+            "409": 200.0,
+            "410": 220.0,
+            "411": 196.0,
+            "412": 194.0,
+            "413": 168.0,
+            "414": 223.0,
+            "415": 204.0,
+            "416": 225.0,
+            "417": 213.0,
+            "418": 196.0,
+            "419": 203.0,
+            "420": 203.0,
+            "421": 217.0,
+            "422": 200.0,
+            "423": 213.0,
+            "424": 237.0,
+            "425": 239.0,
+            "426": 178.0,
+            "427": 213.0,
+            "428": 196.0,
+            "429": 174.0,
+            "430": 243.0,
+            "431": 169.0,
+            "432": 203.0,
+            "433": 211.0,
+            "434": 194.0,
+            "435": 188.0,
+            "436": 208.0,
+            "437": 170.0,
+            "438": 194.0,
+            "439": 156.0,
+            "440": 199.0,
+            "441": 190.0,
+            "442": 232.0,
+            "443": 225.0,
+            "444": 172.0,
+            "445": 194.0,
+            "446": 221.0,
+            "447": 209.0,
+            "448": 233.0,
+            "449": 257.0,
+            "450": 207.0,
+            "451": 199.0,
+            "452": 177.0,
+            "453": 200.0,
+            "454": 227.0,
+            "455": 263.0,
+            "456": 196.0,
+            "457": 204.0,
+            "458": 169.0,
+            "459": 131.0,
+            "460": 216.0,
+            "461": 223.0,
+            "462": 210.0,
+            "463": 203.0,
+            "464": 208.0,
+            "465": 187.0,
+            "466": 190.0,
+            "467": 192.0,
+            "468": 194.0,
+            "469": 188.0,
+            "470": 193.0,
+            "471": 221.0,
+            "472": 166.0,
+            "473": 191.0,
+            "474": 193.0,
+            "475": 196.0,
+            "476": 192.0,
+            "477": 168.0,
+            "478": 180.0,
+            "479": 176.0,
+            "480": 145.0,
+            "481": 197.0,
+            "482": 167.0,
+            "483": 198.0,
+            "484": 172.0,
+            "485": 175.0,
+            "486": 192.0,
+            "487": 143.0,
+            "488": 182.0,
+            "489": 172.0,
+            "490": 178.0,
+            "491": 175.0,
+            "492": 194.0,
+            "493": 211.0,
+            "494": 159.0,
+            "495": 165.0,
+            "496": 153.0,
+            "497": 145.0,
+            "498": 196.0,
+            "499": 195.0,
+            "500": 165.0,
+            "501": 183.0,
+            "502": 167.0,
+            "503": 175.0,
+            "504": 182.0,
+            "505": 212.0,
+            "506": 177.0,
+            "507": 159.0,
+            "508": 135.0,
+            "509": 195.0,
+            "510": 156.0,
+            "511": 186.0,
+            "512": 177.0,
+            "513": 186.0,
+            "514": 173.0,
+            "515": 190.0,
+            "516": 175.0,
+            "517": 143.0,
+            "518": 169.0,
+            "519": 186.0,
+            "520": 156.0,
+            "521": 146.0,
+            "522": 173.0,
+            "523": 175.0,
+            "524": 172.0,
+            "525": 202.0,
+            "526": 168.0,
+            "527": 178.0,
+            "528": 173.0,
+            "529": 183.0,
+            "530": 168.0,
+            "531": 161.0,
+            "532": 185.0,
+            "533": 172.0,
+            "534": 166.0,
+            "535": 140.0,
+            "536": 164.0,
+            "537": 150.0,
+            "538": 155.0,
+            "539": 125.0,
+            "540": 151.0,
+            "541": 130.0,
+            "542": 153.0,
+            "543": 149.0,
+            "544": 185.0,
+            "545": 132.0,
+            "546": 184.0,
+            "547": 150.0,
+            "548": 155.0,
+            "549": 162.0,
+            "550": 170.0,
+            "551": 144.0,
+            "552": 147.0,
+            "553": 213.0,
+            "554": 182.0,
+            "555": 150.0,
+            "556": 162.0,
+            "557": 154.0,
+            "558": 181.0,
+            "559": 144.0,
+            "560": 194.0,
+            "561": 174.0,
+            "562": 147.0,
+            "563": 125.0,
+            "564": 169.0,
+            "565": 143.0,
+            "566": 136.0,
+            "567": 144.0,
+            "568": 153.0,
+            "569": 167.0,
+            "570": 153.0,
+            "571": 131.0,
+            "572": 143.0,
+            "573": 128.0,
+            "574": 162.0,
+            "575": 133.0,
+            "576": 143.0,
+            "577": 171.0,
+            "578": 167.0,
+            "579": 140.0,
+            "580": 165.0,
+            "581": 164.0,
+            "582": 145.0,
+            "583": 151.0,
+            "584": 146.0,
+            "585": 148.0,
+            "586": 102.0,
+            "587": 147.0,
+            "588": 146.0,
+            "589": 123.0,
+            "590": 146.0,
+            "591": 149.0,
+            "592": 115.0,
+            "593": 166.0,
+            "594": 159.0,
+            "595": 127.0,
+            "596": 113.0,
+            "597": 135.0,
+            "598": 139.0,
+            "599": 157.0,
+            "600": 129.0,
+            "601": 144.0,
+            "602": 129.0,
+            "603": 125.0,
+            "604": 125.0,
+            "605": 139.0,
+            "606": 135.0,
+            "607": 144.0,
+            "608": 149.0,
+            "609": 139.0,
+            "610": 135.0,
+            "611": 148.0,
+            "612": 148.0,
+            "613": 115.0,
+            "614": 150.0,
+            "615": 132.0,
+            "616": 156.0,
+            "617": 120.0,
+            "618": 145.0,
+            "619": 136.0,
+            "620": 170.0,
+            "621": 147.0,
+            "622": 150.0,
+            "623": 119.0,
+            "624": 128.0,
+            "625": 141.0,
+            "626": 122.0,
+            "627": 121.0,
+            "628": 157.0,
+            "629": 126.0,
+            "630": 134.0,
+            "631": 147.0,
+            "632": 146.0,
+            "633": 131.0,
+            "634": 145.0,
+            "635": 174.0,
+            "636": 151.0,
+            "637": 169.0,
+            "638": 128.0,
+            "639": 164.0,
+            "640": 145.0,
+            "641": 136.0,
+            "642": 132.0,
+            "643": 134.0,
+            "644": 124.0,
+            "645": 145.0,
+            "646": 106.0,
+            "647": 123.0,
+            "648": 121.0,
+            "649": 134.0,
+            "650": 153.0,
+            "651": 117.0,
+            "652": 163.0,
+            "653": 155.0,
+            "654": 140.0,
+            "655": 154.0,
+            "656": 124.0,
+            "657": 116.0,
+            "658": 130.0,
+            "659": 114.0,
+            "660": 145.0,
+            "661": 121.0,
+            "662": 143.0,
+            "663": 124.0,
+            "664": 139.0,
+            "665": 138.0,
+            "666": 111.0,
+            "667": 127.0,
+            "668": 144.0,
+            "669": 116.0,
+            "670": 139.0,
+            "671": 132.0,
+            "672": 136.0,
+            "673": 139.0,
+            "674": 119.0,
+            "675": 165.0,
+            "676": 123.0,
+            "677": 127.0,
+            "678": 135.0,
+            "679": 83.0,
+            "680": 139.0,
+            "681": 120.0,
+            "682": 111.0,
+            "683": 119.0,
+            "684": 121.0,
+            "685": 145.0,
+            "686": 127.0,
+            "687": 145.0,
+            "688": 117.0,
+            "689": 119.0,
+            "690": 119.0,
+            "691": 124.0,
+            "692": 118.0,
+            "693": 112.0,
+            "694": 156.0,
+            "695": 114.0,
+            "696": 141.0,
+            "697": 123.0,
+            "698": 130.0,
+            "699": 147.0,
+            "700": 119.0,
+            "701": 139.0,
+            "702": 111.0,
+            "703": 113.0,
+            "704": 118.0,
+            "705": 115.0,
+            "706": 102.0,
+            "707": 121.0,
+            "708": 115.0,
+            "709": 116.0,
+            "710": 95.0,
+            "711": 101.0,
+            "712": 98.0,
+            "713": 117.0,
+            "714": 127.0,
+            "715": 135.0,
+            "716": 124.0,
+            "717": 88.0,
+            "718": 143.0,
+            "719": 114.0,
+            "720": 120.0,
+            "721": 106.0,
+            "722": 117.0,
+            "723": 101.0,
+            "724": 97.0,
+            "725": 106.0,
+            "726": 103.0,
+            "727": 95.0,
+            "728": 123.0,
+            "729": 104.0,
+            "730": 124.0,
+            "731": 111.0,
+            "732": 78.0,
+            "733": 96.0,
+            "734": 129.0,
+            "735": 142.0,
+            "736": 110.0,
+            "737": 132.0,
+            "738": 110.0,
+            "739": 136.0,
+            "740": 106.0,
+            "741": 102.0,
+            "742": 123.0,
+            "743": 133.0,
+            "744": 130.0,
+            "745": 109.0,
+            "746": 122.0,
+            "747": 125.0,
+            "748": 133.0,
+            "749": 114.0,
+            "750": 121.0,
+            "751": 113.0,
+            "752": 111.0,
+            "753": 96.0,
+            "754": 118.0,
+            "755": 87.0,
+            "756": 113.0,
+            "757": 91.0,
+            "758": 105.0,
+            "759": 99.0,
+            "760": 125.0,
+            "761": 106.0,
+            "762": 105.0,
+            "763": 101.0,
+            "764": 109.0,
+            "765": 118.0,
+            "766": 95.0,
+            "767": 133.0,
+            "768": 115.0,
+            "769": 122.0,
+            "770": 106.0,
+            "771": 123.0,
+            "772": 106.0,
+            "773": 136.0,
+            "774": 128.0,
+            "775": 116.0,
+            "776": 112.0,
+            "777": 95.0,
+            "778": 113.0,
+            "779": 119.0,
+            "780": 99.0,
+            "781": 107.0,
+            "782": 80.0,
+            "783": 108.0,
+            "784": 122.0,
+            "785": 111.0,
+            "786": 111.0,
+            "787": 115.0,
+            "788": 116.0,
+            "789": 108.0,
+            "790": 127.0,
+            "791": 83.0,
+            "792": 117.0,
+            "793": 102.0,
+            "794": 106.0,
+            "795": 123.0,
+            "796": 121.0,
+            "797": 124.0,
+            "798": 112.0,
+            "799": 136.0,
+            "800": 99.0,
+            "801": 117.0,
+            "802": 93.0,
+            "803": 166.0,
+            "804": 127.0,
+            "805": 124.0,
+            "806": 97.0,
+            "807": 134.0,
+            "808": 108.0,
+            "809": 121.0,
+            "810": 126.0,
+            "811": 107.0,
+            "812": 116.0,
+            "813": 126.0,
+            "814": 105.0,
+            "815": 98.0,
+            "816": 99.0,
+            "817": 97.0,
+            "818": 97.0,
+            "819": 109.0,
+            "820": 106.0,
+            "821": 88.0,
+            "822": 109.0,
+            "823": 108.0,
+            "824": 127.0,
+            "825": 108.0,
+            "826": 128.0,
+            "827": 134.0,
+            "828": 100.0,
+            "829": 125.0,
+            "830": 113.0,
+            "831": 114.0,
+            "832": 107.0,
+            "833": 113.0,
+            "834": 100.0,
+            "835": 98.0,
+            "836": 123.0,
+            "837": 95.0,
+            "838": 118.0,
+            "839": 96.0,
+            "840": 109.0,
+            "841": 98.0,
+            "842": 114.0,
+            "843": 113.0,
+            "844": 123.0,
+            "845": 108.0,
+            "846": 124.0,
+            "847": 112.0,
+            "848": 115.0,
+            "849": 118.0,
+            "850": 92.0,
+            "851": 145.0,
+            "852": 89.0,
+            "853": 106.0,
+            "854": 101.0,
+            "855": 113.0,
+            "856": 125.0,
+            "857": 105.0,
+            "858": 129.0,
+            "859": 107.0,
+            "860": 118.0,
+            "861": 85.0,
+            "862": 106.0,
+            "863": 95.0,
+            "864": 81.0,
+            "865": 104.0,
+            "866": 105.0,
+            "867": 104.0,
+            "868": 106.0,
+            "869": 109.0,
+            "870": 105.0,
+            "871": 122.0,
+            "872": 114.0,
+            "873": 100.0,
+            "874": 113.0,
+            "875": 108.0,
+            "876": 93.0,
+            "877": 130.0,
+            "878": 110.0,
+            "879": 122.0,
+            "880": 106.0,
+            "881": 103.0,
+            "882": 80.0,
+            "883": 107.0,
+            "884": 115.0,
+            "885": 113.0,
+            "886": 116.0,
+            "887": 131.0,
+            "888": 89.0,
+            "889": 120.0,
+            "890": 110.0,
+            "891": 103.0,
+            "892": 102.0,
+            "893": 106.0,
+            "894": 91.0,
+            "895": 118.0,
+            "896": 110.0,
+            "897": 103.0,
+            "898": 115.0,
+            "899": 119.0,
+            "900": 120.0,
+            "901": 99.0,
+            "902": 100.0,
+            "903": 102.0,
+            "904": 127.0,
+            "905": 105.0,
+            "906": 124.0,
+            "907": 104.0,
+            "908": 117.0,
+            "909": 124.0,
+            "910": 108.0,
+            "911": 102.0,
+            "912": 117.0,
+            "913": 122.0,
+            "914": 130.0,
+            "915": 98.0,
+            "916": 120.0,
+            "917": 113.0,
+            "918": 112.0,
+            "919": 85.0,
+            "920": 110.0,
+            "921": 108.0,
+            "922": 111.0,
+            "923": 116.0,
+            "924": 119.0,
+            "925": 105.0,
+            "926": 128.0,
+            "927": 120.0,
+            "928": 106.0,
+            "929": 94.0,
+            "930": 116.0,
+            "931": 102.0,
+            "932": 123.0,
+            "933": 114.0,
+            "934": 133.0,
+            "935": 86.0,
+            "936": 114.0,
+            "937": 96.0,
+            "938": 118.0,
+            "939": 111.0,
+            "940": 110.0,
+            "941": 102.0,
+            "942": 98.0,
+            "943": 119.0,
+            "944": 107.0,
+            "945": 106.0,
+            "946": 112.0,
+            "947": 93.0,
+            "948": 119.0,
+            "949": 116.0,
+            "950": 124.0,
+            "951": 112.0,
+            "952": 106.0,
+            "953": 97.0,
+            "954": 111.0,
+            "955": 112.0,
+            "956": 87.0,
+            "957": 117.0,
+            "958": 97.0,
+            "959": 91.0,
+            "960": 103.0,
+            "961": 102.0,
+            "962": 103.0,
+            "963": 127.0,
+            "964": 113.0,
+            "965": 120.0,
+            "966": 106.0,
+            "967": 104.0,
+            "968": 119.0,
+            "969": 89.0,
+            "970": 121.0,
+            "971": 115.0,
+            "972": 96.0,
+            "973": 90.0,
+            "974": 113.0,
+            "975": 109.0,
+            "976": 113.0,
+            "977": 85.0,
+            "978": 104.0,
+            "979": 109.0,
+            "980": 100.0,
+            "981": 94.0,
+            "982": 105.0,
+            "983": 84.0,
+            "984": 112.0,
+            "985": 108.0,
+            "986": 92.0,
+            "987": 88.0,
+            "988": 123.0,
+            "989": 106.0,
+            "990": 103.0,
+            "991": 128.0,
+            "992": 104.0,
+            "993": 109.0,
+            "994": 98.0,
+            "995": 104.0,
+            "996": 93.0,
+            "997": 128.0,
+            "998": 121.0,
+            "999": 89.0,
+            "1000": 118.0,
+            "1001": 104.0,
+            "1002": 96.0,
+            "1003": 107.0,
+            "1004": 88.0,
+            "1005": 103.0,
+            "1006": 105.0,
+            "1007": 102.0,
+            "1008": 83.0,
+            "1009": 117.0,
+            "1010": 104.0,
+            "1011": 127.0,
+            "1012": 117.0,
+            "1013": 106.0,
+            "1014": 111.0,
+            "1015": 110.0,
+            "1016": 91.0,
+            "1017": 76.0,
+            "1018": 115.0,
+            "1019": 123.0,
+            "1020": 111.0,
+            "1021": 106.0,
+            "1022": 108.0,
+            "1023": 137.0,
+            "1024": 122.0,
+            "1025": 104.0,
+            "1026": 109.0,
+            "1027": 92.0,
+            "1028": 96.0,
+            "1029": 116.0,
+            "1030": 96.0,
+            "1031": 122.0,
+            "1032": 103.0,
+            "1033": 108.0,
+            "1034": 111.0,
+            "1035": 86.0,
+            "1036": 74.0,
+            "1037": 123.0,
+            "1038": 85.0,
+            "1039": 128.0,
+            "1040": 95.0,
+            "1041": 116.0,
+            "1042": 107.0,
+            "1043": 96.0,
+            "1044": 116.0,
+            "1045": 115.0,
+            "1046": 92.0,
+            "1047": 106.0,
+            "1048": 88.0,
+            "1049": 121.0,
+            "1050": 117.0,
+            "1051": 105.0,
+            "1052": 96.0,
+            "1053": 98.0,
+            "1054": 85.0,
+            "1055": 110.0,
+            "1056": 91.0,
+            "1057": 109.0,
+            "1058": 95.0,
+            "1059": 106.0,
+            "1060": 109.0,
+            "1061": 97.0,
+            "1062": 105.0,
+            "1063": 91.0,
+            "1064": 103.0,
+            "1065": 108.0,
+            "1066": 112.0,
+            "1067": 108.0,
+            "1068": 108.0,
+            "1069": 123.0,
+            "1070": 100.0,
+            "1071": 95.0,
+            "1072": 111.0,
+            "1073": 118.0,
+            "1074": 101.0,
+            "1075": 95.0,
+            "1076": 111.0,
+            "1077": 89.0,
+            "1078": 94.0,
+            "1079": 113.0,
+            "1080": 82.0,
+            "1081": 114.0,
+            "1082": 87.0,
+            "1083": 116.0,
+            "1084": 105.0,
+            "1085": 97.0,
+            "1086": 119.0,
+            "1087": 86.0,
+            "1088": 93.0,
+            "1089": 114.0,
+            "1090": 87.0,
+            "1091": 109.0,
+            "1092": 90.0,
+            "1093": 109.0,
+            "1094": 101.0,
+            "1095": 90.0,
+            "1096": 106.0,
+            "1097": 100.0,
+            "1098": 105.0,
+            "1099": 96.0,
+            "1100": 92.0,
+            "1101": 108.0,
+            "1102": 94.0,
+            "1103": 86.0,
+            "1104": 103.0,
+            "1105": 109.0,
+            "1106": 87.0,
+            "1107": 87.0,
+            "1108": 96.0,
+            "1109": 102.0,
+            "1110": 89.0,
+            "1111": 76.0,
+            "1112": 110.0,
+            "1113": 104.0,
+            "1114": 89.0,
+            "1115": 114.0,
+            "1116": 97.0,
+            "1117": 108.0,
+            "1118": 107.0,
+            "1119": 118.0,
+            "1120": 112.0,
+            "1121": 96.0,
+            "1122": 103.0,
+            "1123": 112.0,
+            "1124": 98.0,
+            "1125": 97.0,
+            "1126": 121.0,
+            "1127": 80.0,
+            "1128": 91.0,
+            "1129": 106.0,
+            "1130": 96.0,
+            "1131": 82.0,
+            "1132": 103.0,
+            "1133": 86.0,
+            "1134": 92.0,
+            "1135": 98.0,
+            "1136": 90.0,
+            "1137": 120.0,
+            "1138": 102.0,
+            "1139": 109.0,
+            "1140": 88.0,
+            "1141": 90.0,
+            "1142": 95.0,
+            "1143": 88.0,
+            "1144": 77.0,
+            "1145": 92.0,
+            "1146": 85.0,
+            "1147": 108.0,
+            "1148": 77.0,
+            "1149": 93.0,
+            "1150": 101.0,
+            "1151": 116.0,
+            "1152": 72.0,
+            "1153": 90.0,
+            "1154": 103.0,
+            "1155": 106.0,
+            "1156": 91.0,
+            "1157": 100.0,
+            "1158": 101.0,
+            "1159": 111.0,
+            "1160": 114.0,
+            "1161": 90.0,
+            "1162": 92.0,
+            "1163": 90.0,
+            "1164": 96.0,
+            "1165": 100.0,
+            "1166": 114.0,
+            "1167": 82.0,
+            "1168": 96.0,
+            "1169": 77.0,
+            "1170": 91.0,
+            "1171": 94.0,
+            "1172": 99.0,
+            "1173": 124.0,
+            "1174": 106.0,
+            "1175": 97.0,
+            "1176": 102.0,
+            "1177": 78.0,
+            "1178": 108.0,
+            "1179": 103.0,
+            "1180": 84.0,
+            "1181": 76.0,
+            "1182": 115.0,
+            "1183": 104.0,
+            "1184": 122.0,
+            "1185": 104.0,
+            "1186": 104.0,
+            "1187": 91.0,
+            "1188": 112.0,
+            "1189": 101.0,
+            "1190": 106.0,
+            "1191": 97.0,
+            "1192": 90.0,
+            "1193": 105.0,
+            "1194": 99.0,
+            "1195": 118.0,
+            "1196": 120.0,
+            "1197": 93.0,
+            "1198": 101.0,
+            "1199": 103.0,
+            "1200": 90.0,
+            "1201": 108.0,
+            "1202": 120.0,
+            "1203": 90.0,
+            "1204": 98.0,
+            "1205": 113.0,
+            "1206": 102.0,
+            "1207": 116.0,
+            "1208": 104.0,
+            "1209": 85.0,
+            "1210": 101.0,
+            "1211": 87.0,
+            "1212": 100.0,
+            "1213": 109.0,
+            "1214": 92.0,
+            "1215": 103.0,
+            "1216": 117.0,
+            "1217": 102.0,
+            "1218": 135.0,
+            "1219": 95.0,
+            "1220": 122.0,
+            "1221": 121.0,
+            "1222": 109.0,
+            "1223": 103.0,
+            "1224": 93.0,
+            "1225": 107.0,
+            "1226": 82.0,
+            "1227": 108.0,
+            "1228": 106.0,
+            "1229": 87.0,
+            "1230": 97.0,
+            "1231": 109.0,
+            "1232": 95.0,
+            "1233": 99.0,
+            "1234": 107.0,
+            "1235": 105.0,
+            "1236": 101.0,
+            "1237": 110.0,
+            "1238": 102.0,
+            "1239": 118.0,
+            "1240": 114.0,
+            "1241": 119.0,
+            "1242": 90.0,
+            "1243": 104.0,
+            "1244": 102.0,
+            "1245": 105.0,
+            "1246": 104.0,
+            "1247": 121.0,
+            "1248": 104.0,
+            "1249": 129.0,
+            "1250": 111.0,
+            "1251": 91.0,
+            "1252": 120.0,
+            "1253": 121.0,
+            "1254": 110.0,
+            "1255": 113.0,
+            "1256": 97.0,
+            "1257": 114.0,
+            "1258": 110.0,
+            "1259": 106.0,
+            "1260": 93.0,
+            "1261": 104.0,
+            "1262": 109.0,
+            "1263": 104.0,
+            "1264": 101.0,
+            "1265": 85.0,
+            "1266": 106.0,
+            "1267": 104.0,
+            "1268": 90.0,
+            "1269": 102.0,
+            "1270": 106.0,
+            "1271": 107.0,
+            "1272": 79.0,
+            "1273": 85.0,
+            "1274": 99.0,
+            "1275": 127.0,
+            "1276": 89.0,
+            "1277": 144.0,
+            "1278": 109.0,
+            "1279": 110.0,
+            "1280": 123.0,
+            "1281": 98.0,
+            "1282": 94.0,
+            "1283": 110.0,
+            "1284": 88.0,
+            "1285": 112.0,
+            "1286": 106.0,
+            "1287": 86.0,
+            "1288": 100.0,
+            "1289": 118.0,
+            "1290": 109.0,
+            "1291": 82.0,
+            "1292": 106.0,
+            "1293": 97.0,
+            "1294": 96.0,
+            "1295": 91.0,
+            "1296": 110.0,
+            "1297": 120.0,
+            "1298": 105.0,
+            "1299": 114.0,
+            "1300": 113.0,
+            "1301": 106.0,
+            "1302": 112.0,
+            "1303": 102.0,
+            "1304": 94.0,
+            "1305": 109.0,
+            "1306": 83.0,
+            "1307": 97.0,
+            "1308": 120.0,
+            "1309": 126.0,
+            "1310": 103.0,
+            "1311": 126.0,
+            "1312": 100.0,
+            "1313": 101.0,
+            "1314": 107.0,
+            "1315": 117.0,
+            "1316": 101.0,
+            "1317": 107.0,
+            "1318": 103.0,
+            "1319": 98.0,
+            "1320": 103.0,
+            "1321": 112.0,
+            "1322": 86.0,
+            "1323": 117.0,
+            "1324": 94.0,
+            "1325": 94.0,
+            "1326": 139.0,
+            "1327": 82.0,
+            "1328": 124.0,
+            "1329": 103.0,
+            "1330": 91.0,
+            "1331": 94.0,
+            "1332": 106.0,
+            "1333": 86.0,
+            "1334": 86.0,
+            "1335": 96.0,
+            "1336": 113.0,
+            "1337": 114.0,
+            "1338": 126.0,
+            "1339": 104.0,
+            "1340": 101.0,
+            "1341": 83.0,
+            "1342": 106.0,
+            "1343": 122.0,
+            "1344": 99.0,
+            "1345": 93.0,
+            "1346": 110.0,
+            "1347": 105.0,
+            "1348": 104.0,
+            "1349": 103.0,
+            "1350": 111.0,
+            "1351": 121.0,
+            "1352": 106.0,
+            "1353": 108.0,
+            "1354": 108.0,
+            "1355": 92.0,
+            "1356": 89.0,
+            "1357": 103.0,
+            "1358": 120.0,
+            "1359": 110.0,
+            "1360": 125.0,
+            "1361": 116.0,
+            "1362": 133.0,
+            "1363": 103.0,
+            "1364": 109.0,
+            "1365": 101.0,
+            "1366": 100.0,
+            "1367": 93.0,
+            "1368": 108.0,
+            "1369": 127.0,
+            "1370": 99.0,
+            "1371": 121.0,
+            "1372": 116.0,
+            "1373": 110.0,
+            "1374": 94.0,
+            "1375": 107.0,
+            "1376": 104.0,
+            "1377": 115.0,
+            "1378": 100.0,
+            "1379": 106.0,
+            "1380": 88.0,
+            "1381": 103.0,
+            "1382": 101.0,
+            "1383": 118.0,
+            "1384": 120.0,
+            "1385": 117.0,
+            "1386": 123.0,
+            "1387": 93.0,
+            "1388": 86.0,
+            "1389": 119.0,
+            "1390": 116.0,
+            "1391": 103.0,
+            "1392": 84.0,
+            "1393": 100.0,
+            "1394": 112.0,
+            "1395": 77.0,
+            "1396": 101.0,
+            "1397": 124.0,
+            "1398": 104.0,
+            "1399": 120.0,
+            "1400": 103.0,
+            "1401": 100.0,
+            "1402": 105.0,
+            "1403": 82.0,
+            "1404": 104.0,
+            "1405": 93.0,
+            "1406": 102.0,
+            "1407": 118.0,
+            "1408": 100.0,
+            "1409": 114.0,
+            "1410": 85.0,
+            "1411": 101.0,
+            "1412": 99.0,
+            "1413": 117.0,
+            "1414": 116.0,
+            "1415": 115.0,
+            "1416": 90.0,
+            "1417": 99.0,
+            "1418": 97.0,
+            "1419": 96.0,
+            "1420": 119.0,
+            "1421": 108.0,
+            "1422": 113.0,
+            "1423": 91.0,
+            "1424": 123.0,
+            "1425": 101.0,
+            "1426": 110.0,
+            "1427": 107.0,
+            "1428": 116.0,
+            "1429": 128.0,
+            "1430": 87.0,
+            "1431": 96.0,
+            "1432": 113.0,
+            "1433": 92.0,
+            "1434": 101.0,
+            "1435": 101.0,
+            "1436": 111.0,
+            "1437": 122.0,
+            "1438": 105.0,
+            "1439": 99.0,
+            "1440": 101.0,
+            "1441": 104.0,
+            "1442": 89.0,
+            "1443": 109.0,
+            "1444": 86.0,
+            "1445": 100.0,
+            "1446": 87.0,
+            "1447": 105.0,
+            "1448": 102.0,
+            "1449": 88.0,
+            "1450": 100.0,
+            "1451": 94.0,
+            "1452": 95.0,
+            "1453": 116.0,
+            "1454": 98.0,
+            "1455": 92.0,
+            "1456": 91.0,
+            "1457": 132.0,
+            "1458": 121.0,
+            "1459": 109.0,
+            "1460": 111.0,
+            "1461": 111.0,
+            "1462": 89.0,
+            "1463": 99.0,
+            "1464": 108.0,
+            "1465": 97.0,
+            "1466": 87.0,
+            "1467": 99.0,
+            "1468": 127.0,
+            "1469": 88.0,
+            "1470": 103.0,
+            "1471": 101.0,
+            "1472": 106.0,
+            "1473": 120.0,
+            "1474": 96.0,
+            "1475": 123.0,
+            "1476": 85.0,
+            "1477": 122.0,
+            "1478": 107.0,
+            "1479": 113.0,
+            "1480": 109.0,
+            "1481": 107.0,
+            "1482": 118.0,
+            "1483": 86.0,
+            "1484": 98.0,
+            "1485": 91.0,
+            "1486": 96.0,
+            "1487": 119.0,
+            "1488": 106.0,
+            "1489": 93.0,
+            "1490": 113.0,
+            "1491": 107.0,
+            "1492": 100.0,
+            "1493": 123.0,
+            "1494": 105.0,
+            "1495": 121.0,
+            "1496": 105.0,
+            "1497": 99.0,
+            "1498": 112.0,
+            "1499": 106.0,
+            "1500": 104.0,
+            "1501": 129.0,
+            "1502": 109.0,
+            "1503": 91.0,
+            "1504": 111.0,
+            "1505": 97.0,
+            "1506": 116.0,
+            "1507": 122.0,
+            "1508": 103.0,
+            "1509": 141.0,
+            "1510": 86.0,
+            "1511": 120.0,
+            "1512": 120.0,
+            "1513": 128.0,
+            "1514": 100.0,
+            "1515": 108.0,
+            "1516": 99.0,
+            "1517": 109.0,
+            "1518": 106.0,
+            "1519": 88.0,
+            "1520": 89.0,
+            "1521": 101.0,
+            "1522": 112.0,
+            "1523": 88.0,
+            "1524": 113.0,
+            "1525": 94.0,
+            "1526": 110.0,
+            "1527": 112.0,
+            "1528": 84.0,
+            "1529": 91.0,
+            "1530": 114.0,
+            "1531": 113.0,
+            "1532": 119.0,
+            "1533": 95.0,
+            "1534": 112.0,
+            "1535": 112.0,
+            "1536": 109.0,
+            "1537": 97.0,
+            "1538": 111.0,
+            "1539": 115.0,
+            "1540": 114.0,
+            "1541": 88.0,
+            "1542": 126.0,
+            "1543": 97.0,
+            "1544": 84.0,
+            "1545": 105.0,
+            "1546": 82.0,
+            "1547": 93.0,
+            "1548": 90.0,
+            "1549": 99.0,
+            "1550": 93.0,
+            "1551": 98.0,
+            "1552": 86.0,
+            "1553": 120.0,
+            "1554": 109.0,
+            "1555": 111.0,
+            "1556": 98.0,
+            "1557": 90.0,
+            "1558": 120.0,
+            "1559": 84.0,
+            "1560": 107.0,
+            "1561": 103.0,
+            "1562": 121.0,
+            "1563": 116.0,
+            "1564": 113.0,
+            "1565": 114.0,
+            "1566": 113.0,
+            "1567": 102.0,
+            "1568": 91.0,
+            "1569": 122.0,
+            "1570": 95.0,
+            "1571": 115.0,
+            "1572": 102.0,
+            "1573": 100.0,
+            "1574": 121.0,
+            "1575": 108.0,
+            "1576": 88.0,
+            "1577": 116.0,
+            "1578": 101.0,
+            "1579": 98.0,
+            "1580": 114.0,
+            "1581": 102.0,
+            "1582": 108.0,
+            "1583": 115.0,
+            "1584": 70.0,
+            "1585": 112.0,
+            "1586": 120.0,
+            "1587": 101.0,
+            "1588": 118.0,
+            "1589": 99.0,
+            "1590": 103.0,
+            "1591": 108.0,
+            "1592": 106.0,
+            "1593": 121.0,
+            "1594": 110.0,
+            "1595": 103.0,
+            "1596": 117.0,
+            "1597": 115.0,
+            "1598": 105.0,
+            "1599": 76.0,
+            "1600": 90.0,
+            "1601": 108.0,
+            "1602": 105.0,
+            "1603": 122.0,
+            "1604": 113.0,
+            "1605": 122.0,
+            "1606": 117.0,
+            "1607": 92.0,
+            "1608": 118.0,
+            "1609": 115.0,
+            "1610": 103.0,
+            "1611": 117.0,
+            "1612": 106.0,
+            "1613": 106.0,
+            "1614": 104.0,
+            "1615": 114.0,
+            "1616": 88.0,
+            "1617": 97.0,
+            "1618": 111.0,
+            "1619": 107.0,
+            "1620": 112.0,
+            "1621": 91.0,
+            "1622": 130.0,
+            "1623": 109.0,
+            "1624": 102.0,
+            "1625": 121.0,
+            "1626": 100.0,
+            "1627": 119.0,
+            "1628": 99.0,
+            "1629": 119.0,
+            "1630": 117.0,
+            "1631": 105.0,
+            "1632": 116.0,
+            "1633": 112.0,
+            "1634": 120.0,
+            "1635": 99.0,
+            "1636": 105.0,
+            "1637": 94.0,
+            "1638": 107.0,
+            "1639": 97.0,
+            "1640": 106.0,
+            "1641": 120.0,
+            "1642": 101.0,
+            "1643": 135.0,
+            "1644": 117.0,
+            "1645": 110.0,
+            "1646": 106.0,
+            "1647": 127.0,
+            "1648": 82.0,
+            "1649": 114.0,
+            "1650": 121.0,
+            "1651": 107.0,
+            "1652": 100.0,
+            "1653": 108.0,
+            "1654": 114.0,
+            "1655": 92.0,
+            "1656": 80.0,
+            "1657": 110.0,
+            "1658": 114.0,
+            "1659": 105.0,
+            "1660": 104.0,
+            "1661": 102.0,
+            "1662": 124.0,
+            "1663": 96.0,
+            "1664": 127.0,
+            "1665": 89.0,
+            "1666": 115.0,
+            "1667": 114.0,
+            "1668": 122.0,
+            "1669": 94.0,
+            "1670": 114.0,
+            "1671": 102.0,
+            "1672": 99.0,
+            "1673": 109.0,
+            "1674": 117.0,
+            "1675": 105.0,
+            "1676": 116.0,
+            "1677": 101.0,
+            "1678": 110.0,
+            "1679": 112.0,
+            "1680": 96.0,
+            "1681": 93.0,
+            "1682": 97.0,
+            "1683": 106.0,
+            "1684": 103.0,
+            "1685": 101.0,
+            "1686": 109.0,
+            "1687": 104.0,
+            "1688": 127.0,
+            "1689": 88.0,
+            "1690": 98.0,
+            "1691": 90.0,
+            "1692": 107.0,
+            "1693": 111.0,
+            "1694": 125.0,
+            "1695": 129.0,
+            "1696": 112.0,
+            "1697": 126.0,
+            "1698": 104.0,
+            "1699": 124.0,
+            "1700": 112.0,
+            "1701": 120.0,
+            "1702": 89.0,
+            "1703": 103.0,
+            "1704": 103.0,
+            "1705": 111.0,
+            "1706": 124.0,
+            "1707": 93.0,
+            "1708": 96.0,
+            "1709": 116.0,
+            "1710": 133.0,
+            "1711": 107.0,
+            "1712": 100.0,
+            "1713": 91.0,
+            "1714": 122.0,
+            "1715": 108.0,
+            "1716": 110.0,
+            "1717": 121.0,
+            "1718": 101.0,
+            "1719": 110.0,
+            "1720": 121.0,
+            "1721": 109.0,
+            "1722": 96.0,
+            "1723": 125.0,
+            "1724": 118.0,
+            "1725": 122.0,
+            "1726": 113.0,
+            "1727": 99.0,
+            "1728": 98.0,
+            "1729": 115.0,
+            "1730": 106.0,
+            "1731": 96.0,
+            "1732": 95.0,
+            "1733": 115.0,
+            "1734": 106.0,
+            "1735": 102.0,
+            "1736": 104.0,
+            "1737": 122.0,
+            "1738": 94.0,
+            "1739": 92.0,
+            "1740": 105.0,
+            "1741": 113.0,
+            "1742": 129.0,
+            "1743": 113.0,
+            "1744": 110.0,
+            "1745": 113.0,
+            "1746": 127.0,
+            "1747": 108.0,
+            "1748": 120.0,
+            "1749": 115.0,
+            "1750": 104.0,
+            "1751": 114.0,
+            "1752": 122.0,
+            "1753": 113.0,
+            "1754": 123.0,
+            "1755": 114.0,
+            "1756": 115.0,
+            "1757": 126.0,
+            "1758": 105.0,
+            "1759": 109.0,
+            "1760": 136.0,
+            "1761": 111.0,
+            "1762": 104.0,
+            "1763": 104.0,
+            "1764": 105.0,
+            "1765": 133.0,
+            "1766": 118.0,
+            "1767": 108.0,
+            "1768": 114.0,
+            "1769": 105.0,
+            "1770": 98.0,
+            "1771": 112.0,
+            "1772": 92.0,
+            "1773": 77.0,
+            "1774": 130.0,
+            "1775": 104.0,
+            "1776": 85.0,
+            "1777": 106.0,
+            "1778": 84.0,
+            "1779": 111.0,
+            "1780": 109.0,
+            "1781": 124.0,
+            "1782": 109.0,
+            "1783": 128.0,
+            "1784": 117.0,
+            "1785": 118.0,
+            "1786": 111.0,
+            "1787": 112.0,
+            "1788": 104.0,
+            "1789": 135.0,
+            "1790": 105.0,
+            "1791": 115.0,
+            "1792": 130.0,
+            "1793": 119.0,
+            "1794": 128.0,
+            "1795": 110.0,
+            "1796": 130.0,
+            "1797": 97.0,
+            "1798": 139.0,
+            "1799": 104.0,
+            "1800": 103.0,
+            "1801": 94.0,
+            "1802": 134.0,
+            "1803": 117.0,
+            "1804": 139.0,
+            "1805": 124.0,
+            "1806": 127.0,
+            "1807": 128.0,
+            "1808": 99.0,
+            "1809": 92.0,
+            "1810": 116.0,
+            "1811": 104.0,
+            "1812": 103.0,
+            "1813": 122.0,
+            "1814": 129.0,
+            "1815": 94.0,
+            "1816": 104.0,
+            "1817": 98.0,
+            "1818": 128.0,
+            "1819": 112.0,
+            "1820": 99.0,
+            "1821": 126.0,
+            "1822": 83.0,
+            "1823": 117.0,
+            "1824": 96.0,
+            "1825": 95.0,
+            "1826": 127.0,
+            "1827": 124.0,
+            "1828": 120.0,
+            "1829": 110.0,
+            "1830": 123.0,
+            "1831": 110.0,
+            "1832": 92.0,
+            "1833": 100.0,
+            "1834": 113.0,
+            "1835": 120.0,
+            "1836": 113.0,
+            "1837": 114.0,
+            "1838": 99.0,
+            "1839": 123.0,
+            "1840": 109.0,
+            "1841": 95.0,
+            "1842": 101.0,
+            "1843": 122.0,
+            "1844": 113.0,
+            "1845": 127.0,
+            "1846": 100.0,
+            "1847": 117.0,
+            "1848": 133.0,
+            "1849": 87.0,
+            "1850": 103.0,
+            "1851": 89.0,
+            "1852": 99.0,
+            "1853": 93.0,
+            "1854": 99.0,
+            "1855": 107.0,
+            "1856": 111.0,
+            "1857": 121.0,
+            "1858": 92.0,
+            "1859": 105.0,
+            "1860": 115.0,
+            "1861": 92.0,
+            "1862": 91.0,
+            "1863": 112.0,
+            "1864": 109.0,
+            "1865": 125.0,
+            "1866": 124.0,
+            "1867": 110.0,
+            "1868": 113.0,
+            "1869": 119.0,
+            "1870": 137.0,
+            "1871": 126.0,
+            "1872": 95.0,
+            "1873": 119.0,
+            "1874": 105.0,
+            "1875": 128.0,
+            "1876": 104.0,
+            "1877": 120.0,
+            "1878": 95.0,
+            "1879": 99.0,
+            "1880": 123.0,
+            "1881": 99.0,
+            "1882": 97.0,
+            "1883": 101.0,
+            "1884": 115.0,
+            "1885": 106.0,
+            "1886": 123.0,
+            "1887": 121.0,
+            "1888": 121.0,
+            "1889": 114.0,
+            "1890": 100.0,
+            "1891": 110.0,
+            "1892": 107.0,
+            "1893": 113.0,
+            "1894": 134.0,
+            "1895": 114.0,
+            "1896": 111.0,
+            "1897": 122.0,
+            "1898": 108.0,
+            "1899": 94.0,
+            "1900": 123.0,
+            "1901": 125.0,
+            "1902": 115.0,
+            "1903": 112.0,
+            "1904": 113.0,
+            "1905": 109.0,
+            "1906": 115.0,
+            "1907": 95.0,
+            "1908": 113.0,
+            "1909": 79.0,
+            "1910": 97.0,
+            "1911": 135.0,
+            "1912": 122.0,
+            "1913": 105.0,
+            "1914": 112.0,
+            "1915": 129.0,
+            "1916": 117.0,
+            "1917": 115.0,
+            "1918": 113.0,
+            "1919": 117.0,
+            "1920": 122.0,
+            "1921": 105.0,
+            "1922": 86.0,
+            "1923": 113.0,
+            "1924": 111.0,
+            "1925": 110.0,
+            "1926": 112.0,
+            "1927": 103.0,
+            "1928": 108.0,
+            "1929": 113.0,
+            "1930": 121.0,
+            "1931": 111.0,
+            "1932": 106.0,
+            "1933": 114.0,
+            "1934": 117.0,
+            "1935": 93.0,
+            "1936": 109.0,
+            "1937": 121.0,
+            "1938": 108.0,
+            "1939": 132.0,
+            "1940": 127.0,
+            "1941": 126.0,
+            "1942": 101.0,
+            "1943": 120.0,
+            "1944": 87.0,
+            "1945": 114.0,
+            "1946": 105.0,
+            "1947": 109.0,
+            "1948": 109.0,
+            "1949": 106.0,
+            "1950": 111.0,
+            "1951": 120.0,
+            "1952": 104.0,
+            "1953": 113.0,
+            "1954": 116.0,
+            "1955": 131.0,
+            "1956": 91.0,
+            "1957": 118.0,
+            "1958": 139.0,
+            "1959": 114.0,
+            "1960": 96.0,
+            "1961": 109.0,
+            "1962": 113.0,
+            "1963": 125.0,
+            "1964": 112.0,
+            "1965": 108.0,
+            "1966": 130.0,
+            "1967": 120.0,
+            "1968": 110.0,
+            "1969": 96.0,
+            "1970": 110.0,
+            "1971": 121.0,
+            "1972": 104.0,
+            "1973": 103.0,
+            "1974": 110.0,
+            "1975": 101.0,
+            "1976": 144.0,
+            "1977": 122.0,
+            "1978": 118.0,
+            "1979": 121.0,
+            "1980": 115.0,
+            "1981": 114.0,
+            "1982": 136.0,
+            "1983": 123.0,
+            "1984": 112.0,
+            "1985": 116.0,
+            "1986": 104.0,
+            "1987": 133.0,
+            "1988": 107.0,
+            "1989": 100.0,
+            "1990": 112.0,
+            "1991": 119.0,
+            "1992": 103.0,
+            "1993": 133.0,
+            "1994": 123.0,
+            "1995": 118.0,
+            "1996": 109.0,
+            "1997": 119.0,
+            "1998": 107.0,
+            "1999": 119.0,
+            "2000": 134.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 442918400.0,
+            "2": 442918400.0,
+            "3": 442918400.0,
+            "4": 442918400.0,
+            "5": 442918400.0,
+            "6": 442918400.0,
+            "7": 442918400.0,
+            "8": 442918400.0,
+            "9": 442918400.0,
+            "10": 442918400.0,
+            "11": 442918400.0,
+            "12": 442918400.0,
+            "13": 442918400.0,
+            "14": 442918400.0,
+            "15": 442918400.0,
+            "16": 442918400.0,
+            "17": 442918400.0,
+            "18": 442918400.0,
+            "19": 442918400.0,
+            "20": 442918400.0,
+            "21": 442918400.0,
+            "22": 442918400.0,
+            "23": 442918400.0,
+            "24": 442918400.0,
+            "25": 442918400.0,
+            "26": 442918400.0,
+            "27": 442918400.0,
+            "28": 442918400.0,
+            "29": 442918400.0,
+            "30": 442918400.0,
+            "31": 442918400.0,
+            "32": 442918400.0,
+            "33": 442918400.0,
+            "34": 442918400.0,
+            "35": 442918400.0,
+            "36": 442918400.0,
+            "37": 442918400.0,
+            "38": 442918400.0,
+            "39": 442918400.0,
+            "40": 442918400.0,
+            "41": 442918400.0,
+            "42": 442918400.0,
+            "43": 442918400.0,
+            "44": 442918400.0,
+            "45": 442918400.0,
+            "46": 442918400.0,
+            "47": 442918400.0,
+            "48": 442918400.0,
+            "49": 442918400.0,
+            "50": 442918400.0,
+            "51": 442918400.0,
+            "52": 442918400.0,
+            "53": 442918400.0,
+            "54": 442918400.0,
+            "55": 442918400.0,
+            "56": 442918400.0,
+            "57": 442918400.0,
+            "58": 442918400.0,
+            "59": 442918400.0,
+            "60": 442918400.0,
+            "61": 442918400.0,
+            "62": 442918400.0,
+            "63": 442918400.0,
+            "64": 442918400.0,
+            "65": 442918400.0,
+            "66": 442918400.0,
+            "67": 442918400.0,
+            "68": 442918400.0,
+            "69": 442918400.0,
+            "70": 442918400.0,
+            "71": 442918400.0,
+            "72": 442918400.0,
+            "73": 442918400.0,
+            "74": 442918400.0,
+            "75": 442918400.0,
+            "76": 442918400.0,
+            "77": 442918400.0,
+            "78": 442918400.0,
+            "79": 442918400.0,
+            "80": 442918400.0,
+            "81": 442918400.0,
+            "82": 442918400.0,
+            "83": 442918400.0,
+            "84": 442918400.0,
+            "85": 442918400.0,
+            "86": 442918400.0,
+            "87": 442918400.0,
+            "88": 442918400.0,
+            "89": 442918400.0,
+            "90": 442918400.0,
+            "91": 442918400.0,
+            "92": 442918400.0,
+            "93": 442918400.0,
+            "94": 442918400.0,
+            "95": 442918400.0,
+            "96": 442918400.0,
+            "97": 442918400.0,
+            "98": 442918400.0,
+            "99": 442918400.0,
+            "100": 442918400.0,
+            "101": 442918400.0,
+            "102": 442918400.0,
+            "103": 442918400.0,
+            "104": 442918400.0,
+            "105": 442918400.0,
+            "106": 442918400.0,
+            "107": 442918400.0,
+            "108": 442918400.0,
+            "109": 442918400.0,
+            "110": 442918400.0,
+            "111": 442918400.0,
+            "112": 442918400.0,
+            "113": 442918400.0,
+            "114": 442918400.0,
+            "115": 442918400.0,
+            "116": 442918400.0,
+            "117": 442918400.0,
+            "118": 442918400.0,
+            "119": 442918400.0,
+            "120": 442918400.0,
+            "121": 442918400.0,
+            "122": 442918400.0,
+            "123": 442918400.0,
+            "124": 442918400.0,
+            "125": 442918400.0,
+            "126": 442918400.0,
+            "127": 442918400.0,
+            "128": 442918400.0,
+            "129": 442918400.0,
+            "130": 442918400.0,
+            "131": 442918400.0,
+            "132": 442918400.0,
+            "133": 442918400.0,
+            "134": 442918400.0,
+            "135": 442918400.0,
+            "136": 442918400.0,
+            "137": 442918400.0,
+            "138": 442918400.0,
+            "139": 442918400.0,
+            "140": 442918400.0,
+            "141": 442918400.0,
+            "142": 442918400.0,
+            "143": 442918400.0,
+            "144": 442918400.0,
+            "145": 442918400.0,
+            "146": 442918400.0,
+            "147": 442918400.0,
+            "148": 442918400.0,
+            "149": 442918400.0,
+            "150": 442918400.0,
+            "151": 442918400.0,
+            "152": 442918400.0,
+            "153": 442918400.0,
+            "154": 442918400.0,
+            "155": 442918400.0,
+            "156": 442918400.0,
+            "157": 442918400.0,
+            "158": 442918400.0,
+            "159": 442918400.0,
+            "160": 442918400.0,
+            "161": 442918400.0,
+            "162": 442918400.0,
+            "163": 442918400.0,
+            "164": 442918400.0,
+            "165": 442918400.0,
+            "166": 442918400.0,
+            "167": 442918400.0,
+            "168": 442918400.0,
+            "169": 442918400.0,
+            "170": 442918400.0,
+            "171": 442918400.0,
+            "172": 442918400.0,
+            "173": 442918400.0,
+            "174": 442918400.0,
+            "175": 442918400.0,
+            "176": 442918400.0,
+            "177": 442918400.0,
+            "178": 442918400.0,
+            "179": 442918400.0,
+            "180": 442918400.0,
+            "181": 442918400.0,
+            "182": 442918400.0,
+            "183": 442918400.0,
+            "184": 442918400.0,
+            "185": 442918400.0,
+            "186": 442918400.0,
+            "187": 442918400.0,
+            "188": 442918400.0,
+            "189": 442918400.0,
+            "190": 442918400.0,
+            "191": 442918400.0,
+            "192": 442918400.0,
+            "193": 442918400.0,
+            "194": 442918400.0,
+            "195": 442918400.0,
+            "196": 442918400.0,
+            "197": 442918400.0,
+            "198": 442918400.0,
+            "199": 442918400.0,
+            "200": 442918400.0,
+            "201": 442918400.0,
+            "202": 442918400.0,
+            "203": 442918400.0,
+            "204": 442918400.0,
+            "205": 442918400.0,
+            "206": 442918400.0,
+            "207": 442918400.0,
+            "208": 442918400.0,
+            "209": 442918400.0,
+            "210": 442918400.0,
+            "211": 442918400.0,
+            "212": 442918400.0,
+            "213": 442918400.0,
+            "214": 442918400.0,
+            "215": 442918400.0,
+            "216": 442918400.0,
+            "217": 442918400.0,
+            "218": 442918400.0,
+            "219": 442918400.0,
+            "220": 442918400.0,
+            "221": 442918400.0,
+            "222": 442918400.0,
+            "223": 442918400.0,
+            "224": 442918400.0,
+            "225": 442918400.0,
+            "226": 442918400.0,
+            "227": 442918400.0,
+            "228": 442918400.0,
+            "229": 442918400.0,
+            "230": 442918400.0,
+            "231": 442918400.0,
+            "232": 442918400.0,
+            "233": 442918400.0,
+            "234": 442918400.0,
+            "235": 442918400.0,
+            "236": 442918400.0,
+            "237": 442918400.0,
+            "238": 442918400.0,
+            "239": 442918400.0,
+            "240": 442918400.0,
+            "241": 442918400.0,
+            "242": 442918400.0,
+            "243": 442918400.0,
+            "244": 442918400.0,
+            "245": 442918400.0,
+            "246": 442918400.0,
+            "247": 442918400.0,
+            "248": 442918400.0,
+            "249": 442918400.0,
+            "250": 442918400.0,
+            "251": 442918400.0,
+            "252": 442918400.0,
+            "253": 442918400.0,
+            "254": 442918400.0,
+            "255": 442918400.0,
+            "256": 442918400.0,
+            "257": 442918400.0,
+            "258": 442918400.0,
+            "259": 442918400.0,
+            "260": 442918400.0,
+            "261": 442918400.0,
+            "262": 442918400.0,
+            "263": 442918400.0,
+            "264": 442918400.0,
+            "265": 442918400.0,
+            "266": 442918400.0,
+            "267": 442918400.0,
+            "268": 442918400.0,
+            "269": 442918400.0,
+            "270": 442918400.0,
+            "271": 442918400.0,
+            "272": 442918400.0,
+            "273": 442918400.0,
+            "274": 442918400.0,
+            "275": 442918400.0,
+            "276": 442918400.0,
+            "277": 442918400.0,
+            "278": 442918400.0,
+            "279": 442918400.0,
+            "280": 442918400.0,
+            "281": 442918400.0,
+            "282": 442918400.0,
+            "283": 442918400.0,
+            "284": 442918400.0,
+            "285": 442918400.0,
+            "286": 442918400.0,
+            "287": 442918400.0,
+            "288": 442918400.0,
+            "289": 442918400.0,
+            "290": 442918400.0,
+            "291": 442918400.0,
+            "292": 442918400.0,
+            "293": 442918400.0,
+            "294": 442918400.0,
+            "295": 442918400.0,
+            "296": 442918400.0,
+            "297": 442918400.0,
+            "298": 442918400.0,
+            "299": 442918400.0,
+            "300": 442918400.0,
+            "301": 442918400.0,
+            "302": 442918400.0,
+            "303": 442918400.0,
+            "304": 442918400.0,
+            "305": 442918400.0,
+            "306": 442918400.0,
+            "307": 442918400.0,
+            "308": 442918400.0,
+            "309": 442918400.0,
+            "310": 442918400.0,
+            "311": 442918400.0,
+            "312": 442918400.0,
+            "313": 442918400.0,
+            "314": 442918400.0,
+            "315": 442918400.0,
+            "316": 442918400.0,
+            "317": 442918400.0,
+            "318": 442918400.0,
+            "319": 442918400.0,
+            "320": 442918400.0,
+            "321": 442918400.0,
+            "322": 442918400.0,
+            "323": 442918400.0,
+            "324": 442918400.0,
+            "325": 442918400.0,
+            "326": 442918400.0,
+            "327": 442918400.0,
+            "328": 442918400.0,
+            "329": 442918400.0,
+            "330": 442918400.0,
+            "331": 442918400.0,
+            "332": 442918400.0,
+            "333": 442918400.0,
+            "334": 442918400.0,
+            "335": 442918400.0,
+            "336": 442918400.0,
+            "337": 442918400.0,
+            "338": 442918400.0,
+            "339": 442918400.0,
+            "340": 442918400.0,
+            "341": 442918400.0,
+            "342": 442918400.0,
+            "343": 442918400.0,
+            "344": 442918400.0,
+            "345": 442918400.0,
+            "346": 442918400.0,
+            "347": 442918400.0,
+            "348": 442918400.0,
+            "349": 442918400.0,
+            "350": 442918400.0,
+            "351": 442918400.0,
+            "352": 442918400.0,
+            "353": 442918400.0,
+            "354": 442918400.0,
+            "355": 442918400.0,
+            "356": 442918400.0,
+            "357": 442918400.0,
+            "358": 442918400.0,
+            "359": 442918400.0,
+            "360": 442918400.0,
+            "361": 442918400.0,
+            "362": 442918400.0,
+            "363": 442918400.0,
+            "364": 442918400.0,
+            "365": 442918400.0,
+            "366": 442918400.0,
+            "367": 442918400.0,
+            "368": 442918400.0,
+            "369": 442918400.0,
+            "370": 442918400.0,
+            "371": 442918400.0,
+            "372": 442918400.0,
+            "373": 442918400.0,
+            "374": 442918400.0,
+            "375": 442918400.0,
+            "376": 442918400.0,
+            "377": 442918400.0,
+            "378": 442918400.0,
+            "379": 442918400.0,
+            "380": 442918400.0,
+            "381": 442918400.0,
+            "382": 442918400.0,
+            "383": 442918400.0,
+            "384": 442918400.0,
+            "385": 442918400.0,
+            "386": 442918400.0,
+            "387": 442918400.0,
+            "388": 442918400.0,
+            "389": 442918400.0,
+            "390": 442918400.0,
+            "391": 442918400.0,
+            "392": 442918400.0,
+            "393": 442918400.0,
+            "394": 442918400.0,
+            "395": 442918400.0,
+            "396": 442918400.0,
+            "397": 442918400.0,
+            "398": 442918400.0,
+            "399": 442918400.0,
+            "400": 442918400.0,
+            "401": 442918400.0,
+            "402": 442918400.0,
+            "403": 442918400.0,
+            "404": 442918400.0,
+            "405": 442918400.0,
+            "406": 442918400.0,
+            "407": 442918400.0,
+            "408": 442918400.0,
+            "409": 442918400.0,
+            "410": 442918400.0,
+            "411": 442918400.0,
+            "412": 442918400.0,
+            "413": 442918400.0,
+            "414": 442918400.0,
+            "415": 442918400.0,
+            "416": 442918400.0,
+            "417": 442918400.0,
+            "418": 442918400.0,
+            "419": 442918400.0,
+            "420": 442918400.0,
+            "421": 442918400.0,
+            "422": 442918400.0,
+            "423": 442918400.0,
+            "424": 442918400.0,
+            "425": 442918400.0,
+            "426": 442918400.0,
+            "427": 442918400.0,
+            "428": 442918400.0,
+            "429": 442918400.0,
+            "430": 442918400.0,
+            "431": 442918400.0,
+            "432": 442918400.0,
+            "433": 442918400.0,
+            "434": 442918400.0,
+            "435": 442918400.0,
+            "436": 442918400.0,
+            "437": 442918400.0,
+            "438": 442918400.0,
+            "439": 442918400.0,
+            "440": 442918400.0,
+            "441": 442918400.0,
+            "442": 442918400.0,
+            "443": 442918400.0,
+            "444": 442918400.0,
+            "445": 442918400.0,
+            "446": 442918400.0,
+            "447": 442918400.0,
+            "448": 442918400.0,
+            "449": 442918400.0,
+            "450": 442918400.0,
+            "451": 442918400.0,
+            "452": 442918400.0,
+            "453": 442918400.0,
+            "454": 442918400.0,
+            "455": 442918400.0,
+            "456": 442918400.0,
+            "457": 442918400.0,
+            "458": 442918400.0,
+            "459": 442918400.0,
+            "460": 442918400.0,
+            "461": 442918400.0,
+            "462": 442918400.0,
+            "463": 442918400.0,
+            "464": 442918400.0,
+            "465": 442918400.0,
+            "466": 442918400.0,
+            "467": 442918400.0,
+            "468": 442918400.0,
+            "469": 442918400.0,
+            "470": 442918400.0,
+            "471": 442918400.0,
+            "472": 442918400.0,
+            "473": 442918400.0,
+            "474": 442918400.0,
+            "475": 442918400.0,
+            "476": 442918400.0,
+            "477": 442918400.0,
+            "478": 442918400.0,
+            "479": 442918400.0,
+            "480": 442918400.0,
+            "481": 442918400.0,
+            "482": 442918400.0,
+            "483": 442918400.0,
+            "484": 442918400.0,
+            "485": 442918400.0,
+            "486": 442918400.0,
+            "487": 442918400.0,
+            "488": 442918400.0,
+            "489": 442918400.0,
+            "490": 442918400.0,
+            "491": 442918400.0,
+            "492": 442918400.0,
+            "493": 442918400.0,
+            "494": 442918400.0,
+            "495": 442918400.0,
+            "496": 442918400.0,
+            "497": 442918400.0,
+            "498": 442918400.0,
+            "499": 442918400.0,
+            "500": 442918400.0,
+            "501": 442918400.0,
+            "502": 442918400.0,
+            "503": 442918400.0,
+            "504": 442918400.0,
+            "505": 442918400.0,
+            "506": 442918400.0,
+            "507": 442918400.0,
+            "508": 442918400.0,
+            "509": 442918400.0,
+            "510": 442918400.0,
+            "511": 442918400.0,
+            "512": 442918400.0,
+            "513": 442918400.0,
+            "514": 442918400.0,
+            "515": 442918400.0,
+            "516": 442918400.0,
+            "517": 442918400.0,
+            "518": 442918400.0,
+            "519": 442918400.0,
+            "520": 442918400.0,
+            "521": 442918400.0,
+            "522": 442918400.0,
+            "523": 442918400.0,
+            "524": 442918400.0,
+            "525": 442918400.0,
+            "526": 442918400.0,
+            "527": 442918400.0,
+            "528": 442918400.0,
+            "529": 442918400.0,
+            "530": 442918400.0,
+            "531": 442918400.0,
+            "532": 442918400.0,
+            "533": 442918400.0,
+            "534": 442918400.0,
+            "535": 442918400.0,
+            "536": 442918400.0,
+            "537": 442918400.0,
+            "538": 442918400.0,
+            "539": 442918400.0,
+            "540": 442918400.0,
+            "541": 442918400.0,
+            "542": 442918400.0,
+            "543": 442918400.0,
+            "544": 442918400.0,
+            "545": 442918400.0,
+            "546": 442918400.0,
+            "547": 442918400.0,
+            "548": 442918400.0,
+            "549": 442918400.0,
+            "550": 442918400.0,
+            "551": 442918400.0,
+            "552": 442918400.0,
+            "553": 442918400.0,
+            "554": 442918400.0,
+            "555": 442918400.0,
+            "556": 442918400.0,
+            "557": 442918400.0,
+            "558": 442918400.0,
+            "559": 442918400.0,
+            "560": 442918400.0,
+            "561": 442918400.0,
+            "562": 442918400.0,
+            "563": 442918400.0,
+            "564": 442918400.0,
+            "565": 442918400.0,
+            "566": 442918400.0,
+            "567": 442918400.0,
+            "568": 442918400.0,
+            "569": 442918400.0,
+            "570": 442918400.0,
+            "571": 442918400.0,
+            "572": 442918400.0,
+            "573": 442918400.0,
+            "574": 442918400.0,
+            "575": 442918400.0,
+            "576": 442918400.0,
+            "577": 442918400.0,
+            "578": 442918400.0,
+            "579": 442918400.0,
+            "580": 442918400.0,
+            "581": 442918400.0,
+            "582": 442918400.0,
+            "583": 442918400.0,
+            "584": 442918400.0,
+            "585": 442918400.0,
+            "586": 442918400.0,
+            "587": 442918400.0,
+            "588": 442918400.0,
+            "589": 442918400.0,
+            "590": 442918400.0,
+            "591": 442918400.0,
+            "592": 442918400.0,
+            "593": 442918400.0,
+            "594": 442918400.0,
+            "595": 442918400.0,
+            "596": 442918400.0,
+            "597": 442918400.0,
+            "598": 442918400.0,
+            "599": 442918400.0,
+            "600": 442918400.0,
+            "601": 442918400.0,
+            "602": 442918400.0,
+            "603": 442918400.0,
+            "604": 442918400.0,
+            "605": 442918400.0,
+            "606": 442918400.0,
+            "607": 442918400.0,
+            "608": 442918400.0,
+            "609": 442918400.0,
+            "610": 442918400.0,
+            "611": 442918400.0,
+            "612": 442918400.0,
+            "613": 442918400.0,
+            "614": 442918400.0,
+            "615": 442918400.0,
+            "616": 442918400.0,
+            "617": 442918400.0,
+            "618": 442918400.0,
+            "619": 442918400.0,
+            "620": 442918400.0,
+            "621": 442918400.0,
+            "622": 442918400.0,
+            "623": 442918400.0,
+            "624": 442918400.0,
+            "625": 442918400.0,
+            "626": 442918400.0,
+            "627": 442918400.0,
+            "628": 442918400.0,
+            "629": 442918400.0,
+            "630": 442918400.0,
+            "631": 442918400.0,
+            "632": 442918400.0,
+            "633": 442918400.0,
+            "634": 442918400.0,
+            "635": 442918400.0,
+            "636": 442918400.0,
+            "637": 442918400.0,
+            "638": 442918400.0,
+            "639": 442918400.0,
+            "640": 442918400.0,
+            "641": 442918400.0,
+            "642": 442918400.0,
+            "643": 442918400.0,
+            "644": 442918400.0,
+            "645": 442918400.0,
+            "646": 442918400.0,
+            "647": 442918400.0,
+            "648": 442918400.0,
+            "649": 442918400.0,
+            "650": 442918400.0,
+            "651": 442918400.0,
+            "652": 442918400.0,
+            "653": 442918400.0,
+            "654": 442918400.0,
+            "655": 442918400.0,
+            "656": 442918400.0,
+            "657": 442918400.0,
+            "658": 442918400.0,
+            "659": 442918400.0,
+            "660": 442918400.0,
+            "661": 442918400.0,
+            "662": 442918400.0,
+            "663": 442918400.0,
+            "664": 442918400.0,
+            "665": 442918400.0,
+            "666": 442918400.0,
+            "667": 442918400.0,
+            "668": 442918400.0,
+            "669": 442918400.0,
+            "670": 442918400.0,
+            "671": 442918400.0,
+            "672": 442918400.0,
+            "673": 442918400.0,
+            "674": 442918400.0,
+            "675": 442918400.0,
+            "676": 442918400.0,
+            "677": 442918400.0,
+            "678": 442918400.0,
+            "679": 442918400.0,
+            "680": 442918400.0,
+            "681": 442918400.0,
+            "682": 442918400.0,
+            "683": 442918400.0,
+            "684": 442918400.0,
+            "685": 442918400.0,
+            "686": 442918400.0,
+            "687": 442918400.0,
+            "688": 442918400.0,
+            "689": 442918400.0,
+            "690": 442918400.0,
+            "691": 442918400.0,
+            "692": 442918400.0,
+            "693": 442918400.0,
+            "694": 442918400.0,
+            "695": 442918400.0,
+            "696": 442918400.0,
+            "697": 442918400.0,
+            "698": 442918400.0,
+            "699": 442918400.0,
+            "700": 442918400.0,
+            "701": 442918400.0,
+            "702": 442918400.0,
+            "703": 442918400.0,
+            "704": 442918400.0,
+            "705": 442918400.0,
+            "706": 442918400.0,
+            "707": 442918400.0,
+            "708": 442918400.0,
+            "709": 442918400.0,
+            "710": 442918400.0,
+            "711": 442918400.0,
+            "712": 442918400.0,
+            "713": 442918400.0,
+            "714": 442918400.0,
+            "715": 442918400.0,
+            "716": 442918400.0,
+            "717": 442918400.0,
+            "718": 442918400.0,
+            "719": 442918400.0,
+            "720": 442918400.0,
+            "721": 442918400.0,
+            "722": 442918400.0,
+            "723": 442918400.0,
+            "724": 442918400.0,
+            "725": 442918400.0,
+            "726": 442918400.0,
+            "727": 442918400.0,
+            "728": 442918400.0,
+            "729": 442918400.0,
+            "730": 442918400.0,
+            "731": 442918400.0,
+            "732": 442918400.0,
+            "733": 442918400.0,
+            "734": 442918400.0,
+            "735": 442918400.0,
+            "736": 442918400.0,
+            "737": 442918400.0,
+            "738": 442918400.0,
+            "739": 442918400.0,
+            "740": 442918400.0,
+            "741": 442918400.0,
+            "742": 442918400.0,
+            "743": 442918400.0,
+            "744": 442918400.0,
+            "745": 442918400.0,
+            "746": 442918400.0,
+            "747": 442918400.0,
+            "748": 442918400.0,
+            "749": 442918400.0,
+            "750": 442918400.0,
+            "751": 442918400.0,
+            "752": 442918400.0,
+            "753": 442918400.0,
+            "754": 442918400.0,
+            "755": 442918400.0,
+            "756": 442918400.0,
+            "757": 442918400.0,
+            "758": 442918400.0,
+            "759": 442918400.0,
+            "760": 442918400.0,
+            "761": 442918400.0,
+            "762": 442918400.0,
+            "763": 442918400.0,
+            "764": 442918400.0,
+            "765": 442918400.0,
+            "766": 442918400.0,
+            "767": 442918400.0,
+            "768": 442918400.0,
+            "769": 442918400.0,
+            "770": 442918400.0,
+            "771": 442918400.0,
+            "772": 442918400.0,
+            "773": 442918400.0,
+            "774": 442918400.0,
+            "775": 442918400.0,
+            "776": 442918400.0,
+            "777": 442918400.0,
+            "778": 442918400.0,
+            "779": 442918400.0,
+            "780": 442918400.0,
+            "781": 442918400.0,
+            "782": 442918400.0,
+            "783": 442918400.0,
+            "784": 442918400.0,
+            "785": 442918400.0,
+            "786": 442918400.0,
+            "787": 442918400.0,
+            "788": 442918400.0,
+            "789": 442918400.0,
+            "790": 442918400.0,
+            "791": 442918400.0,
+            "792": 442918400.0,
+            "793": 442918400.0,
+            "794": 442918400.0,
+            "795": 442918400.0,
+            "796": 442918400.0,
+            "797": 442918400.0,
+            "798": 442918400.0,
+            "799": 442918400.0,
+            "800": 442918400.0,
+            "801": 442918400.0,
+            "802": 442918400.0,
+            "803": 442918400.0,
+            "804": 442918400.0,
+            "805": 442918400.0,
+            "806": 442918400.0,
+            "807": 442918400.0,
+            "808": 442918400.0,
+            "809": 442918400.0,
+            "810": 442918400.0,
+            "811": 442918400.0,
+            "812": 442918400.0,
+            "813": 442918400.0,
+            "814": 442918400.0,
+            "815": 442918400.0,
+            "816": 442918400.0,
+            "817": 442918400.0,
+            "818": 442918400.0,
+            "819": 442918400.0,
+            "820": 442918400.0,
+            "821": 442918400.0,
+            "822": 442918400.0,
+            "823": 442918400.0,
+            "824": 442918400.0,
+            "825": 442918400.0,
+            "826": 442918400.0,
+            "827": 442918400.0,
+            "828": 442918400.0,
+            "829": 442918400.0,
+            "830": 442918400.0,
+            "831": 442918400.0,
+            "832": 442918400.0,
+            "833": 442918400.0,
+            "834": 442918400.0,
+            "835": 442918400.0,
+            "836": 442918400.0,
+            "837": 442918400.0,
+            "838": 442918400.0,
+            "839": 442918400.0,
+            "840": 442918400.0,
+            "841": 442918400.0,
+            "842": 442918400.0,
+            "843": 442918400.0,
+            "844": 442918400.0,
+            "845": 442918400.0,
+            "846": 442918400.0,
+            "847": 442918400.0,
+            "848": 442918400.0,
+            "849": 442918400.0,
+            "850": 442918400.0,
+            "851": 442918400.0,
+            "852": 442918400.0,
+            "853": 442918400.0,
+            "854": 442918400.0,
+            "855": 442918400.0,
+            "856": 442918400.0,
+            "857": 442918400.0,
+            "858": 442918400.0,
+            "859": 442918400.0,
+            "860": 442918400.0,
+            "861": 442918400.0,
+            "862": 442918400.0,
+            "863": 442918400.0,
+            "864": 442918400.0,
+            "865": 442918400.0,
+            "866": 442918400.0,
+            "867": 442918400.0,
+            "868": 442918400.0,
+            "869": 442918400.0,
+            "870": 442918400.0,
+            "871": 442918400.0,
+            "872": 442918400.0,
+            "873": 442918400.0,
+            "874": 442918400.0,
+            "875": 442918400.0,
+            "876": 442918400.0,
+            "877": 442918400.0,
+            "878": 442918400.0,
+            "879": 442918400.0,
+            "880": 442918400.0,
+            "881": 442918400.0,
+            "882": 442918400.0,
+            "883": 442918400.0,
+            "884": 442918400.0,
+            "885": 442918400.0,
+            "886": 442918400.0,
+            "887": 442918400.0,
+            "888": 442918400.0,
+            "889": 442918400.0,
+            "890": 442918400.0,
+            "891": 442918400.0,
+            "892": 442918400.0,
+            "893": 442918400.0,
+            "894": 442918400.0,
+            "895": 442918400.0,
+            "896": 442918400.0,
+            "897": 442918400.0,
+            "898": 442918400.0,
+            "899": 442918400.0,
+            "900": 442918400.0,
+            "901": 442918400.0,
+            "902": 442918400.0,
+            "903": 442918400.0,
+            "904": 442918400.0,
+            "905": 442918400.0,
+            "906": 442918400.0,
+            "907": 442918400.0,
+            "908": 442918400.0,
+            "909": 442918400.0,
+            "910": 442918400.0,
+            "911": 442918400.0,
+            "912": 442918400.0,
+            "913": 442918400.0,
+            "914": 442918400.0,
+            "915": 442918400.0,
+            "916": 442918400.0,
+            "917": 442918400.0,
+            "918": 442918400.0,
+            "919": 442918400.0,
+            "920": 442918400.0,
+            "921": 442918400.0,
+            "922": 442918400.0,
+            "923": 442918400.0,
+            "924": 442918400.0,
+            "925": 442918400.0,
+            "926": 442918400.0,
+            "927": 442918400.0,
+            "928": 442918400.0,
+            "929": 442918400.0,
+            "930": 442918400.0,
+            "931": 442918400.0,
+            "932": 442918400.0,
+            "933": 442918400.0,
+            "934": 442918400.0,
+            "935": 442918400.0,
+            "936": 442918400.0,
+            "937": 442918400.0,
+            "938": 442918400.0,
+            "939": 442918400.0,
+            "940": 442918400.0,
+            "941": 442918400.0,
+            "942": 442918400.0,
+            "943": 442918400.0,
+            "944": 442918400.0,
+            "945": 442918400.0,
+            "946": 442918400.0,
+            "947": 442918400.0,
+            "948": 442918400.0,
+            "949": 442918400.0,
+            "950": 442918400.0,
+            "951": 442918400.0,
+            "952": 442918400.0,
+            "953": 442918400.0,
+            "954": 442918400.0,
+            "955": 442918400.0,
+            "956": 442918400.0,
+            "957": 442918400.0,
+            "958": 442918400.0,
+            "959": 442918400.0,
+            "960": 442918400.0,
+            "961": 442918400.0,
+            "962": 442918400.0,
+            "963": 442918400.0,
+            "964": 442918400.0,
+            "965": 442918400.0,
+            "966": 442918400.0,
+            "967": 442918400.0,
+            "968": 442918400.0,
+            "969": 442918400.0,
+            "970": 442918400.0,
+            "971": 442918400.0,
+            "972": 442918400.0,
+            "973": 442918400.0,
+            "974": 442918400.0,
+            "975": 442918400.0,
+            "976": 442918400.0,
+            "977": 442918400.0,
+            "978": 442918400.0,
+            "979": 442918400.0,
+            "980": 442918400.0,
+            "981": 442918400.0,
+            "982": 442918400.0,
+            "983": 442918400.0,
+            "984": 442918400.0,
+            "985": 442918400.0,
+            "986": 442918400.0,
+            "987": 442918400.0,
+            "988": 442918400.0,
+            "989": 442918400.0,
+            "990": 442918400.0,
+            "991": 442918400.0,
+            "992": 442918400.0,
+            "993": 442918400.0,
+            "994": 442918400.0,
+            "995": 442918400.0,
+            "996": 442918400.0,
+            "997": 442918400.0,
+            "998": 442918400.0,
+            "999": 442918400.0,
+            "1000": 442918400.0,
+            "1001": 442918400.0,
+            "1002": 442918400.0,
+            "1003": 442918400.0,
+            "1004": 442918400.0,
+            "1005": 442918400.0,
+            "1006": 442918400.0,
+            "1007": 442918400.0,
+            "1008": 442918400.0,
+            "1009": 442918400.0,
+            "1010": 442918400.0,
+            "1011": 442918400.0,
+            "1012": 442918400.0,
+            "1013": 442918400.0,
+            "1014": 442918400.0,
+            "1015": 442918400.0,
+            "1016": 442918400.0,
+            "1017": 442918400.0,
+            "1018": 442918400.0,
+            "1019": 442918400.0,
+            "1020": 442918400.0,
+            "1021": 442918400.0,
+            "1022": 442918400.0,
+            "1023": 442918400.0,
+            "1024": 442918400.0,
+            "1025": 442918400.0,
+            "1026": 442918400.0,
+            "1027": 442918400.0,
+            "1028": 442918400.0,
+            "1029": 442918400.0,
+            "1030": 442918400.0,
+            "1031": 442918400.0,
+            "1032": 442918400.0,
+            "1033": 442918400.0,
+            "1034": 442918400.0,
+            "1035": 442918400.0,
+            "1036": 442918400.0,
+            "1037": 442918400.0,
+            "1038": 442918400.0,
+            "1039": 442918400.0,
+            "1040": 442918400.0,
+            "1041": 442918400.0,
+            "1042": 442918400.0,
+            "1043": 442918400.0,
+            "1044": 442918400.0,
+            "1045": 442918400.0,
+            "1046": 442918400.0,
+            "1047": 442918400.0,
+            "1048": 442918400.0,
+            "1049": 442918400.0,
+            "1050": 442918400.0,
+            "1051": 442918400.0,
+            "1052": 442918400.0,
+            "1053": 442918400.0,
+            "1054": 442918400.0,
+            "1055": 442918400.0,
+            "1056": 442918400.0,
+            "1057": 442918400.0,
+            "1058": 442918400.0,
+            "1059": 442918400.0,
+            "1060": 442918400.0,
+            "1061": 442918400.0,
+            "1062": 442918400.0,
+            "1063": 442918400.0,
+            "1064": 442918400.0,
+            "1065": 442918400.0,
+            "1066": 442918400.0,
+            "1067": 442918400.0,
+            "1068": 442918400.0,
+            "1069": 442918400.0,
+            "1070": 442918400.0,
+            "1071": 442918400.0,
+            "1072": 442918400.0,
+            "1073": 442918400.0,
+            "1074": 442918400.0,
+            "1075": 442918400.0,
+            "1076": 442918400.0,
+            "1077": 442918400.0,
+            "1078": 442918400.0,
+            "1079": 442918400.0,
+            "1080": 442918400.0,
+            "1081": 442918400.0,
+            "1082": 442918400.0,
+            "1083": 442918400.0,
+            "1084": 442918400.0,
+            "1085": 442918400.0,
+            "1086": 442918400.0,
+            "1087": 442918400.0,
+            "1088": 442918400.0,
+            "1089": 442918400.0,
+            "1090": 442918400.0,
+            "1091": 442918400.0,
+            "1092": 442918400.0,
+            "1093": 442918400.0,
+            "1094": 442918400.0,
+            "1095": 442918400.0,
+            "1096": 442918400.0,
+            "1097": 442918400.0,
+            "1098": 442918400.0,
+            "1099": 442918400.0,
+            "1100": 442918400.0,
+            "1101": 442918400.0,
+            "1102": 442918400.0,
+            "1103": 442918400.0,
+            "1104": 442918400.0,
+            "1105": 442918400.0,
+            "1106": 442918400.0,
+            "1107": 442918400.0,
+            "1108": 442918400.0,
+            "1109": 442918400.0,
+            "1110": 442918400.0,
+            "1111": 442918400.0,
+            "1112": 442918400.0,
+            "1113": 442918400.0,
+            "1114": 442918400.0,
+            "1115": 442918400.0,
+            "1116": 442918400.0,
+            "1117": 442918400.0,
+            "1118": 442918400.0,
+            "1119": 442918400.0,
+            "1120": 442918400.0,
+            "1121": 442918400.0,
+            "1122": 442918400.0,
+            "1123": 442918400.0,
+            "1124": 442918400.0,
+            "1125": 442918400.0,
+            "1126": 442918400.0,
+            "1127": 442918400.0,
+            "1128": 442918400.0,
+            "1129": 442918400.0,
+            "1130": 442918400.0,
+            "1131": 442918400.0,
+            "1132": 442918400.0,
+            "1133": 442918400.0,
+            "1134": 442918400.0,
+            "1135": 442918400.0,
+            "1136": 442918400.0,
+            "1137": 442918400.0,
+            "1138": 442918400.0,
+            "1139": 442918400.0,
+            "1140": 442918400.0,
+            "1141": 442918400.0,
+            "1142": 442918400.0,
+            "1143": 442918400.0,
+            "1144": 442918400.0,
+            "1145": 442918400.0,
+            "1146": 442918400.0,
+            "1147": 442918400.0,
+            "1148": 442918400.0,
+            "1149": 442918400.0,
+            "1150": 442918400.0,
+            "1151": 442918400.0,
+            "1152": 442918400.0,
+            "1153": 442918400.0,
+            "1154": 442918400.0,
+            "1155": 442918400.0,
+            "1156": 442918400.0,
+            "1157": 442918400.0,
+            "1158": 442918400.0,
+            "1159": 442918400.0,
+            "1160": 442918400.0,
+            "1161": 442918400.0,
+            "1162": 442918400.0,
+            "1163": 442918400.0,
+            "1164": 442918400.0,
+            "1165": 442918400.0,
+            "1166": 442918400.0,
+            "1167": 442918400.0,
+            "1168": 442918400.0,
+            "1169": 442918400.0,
+            "1170": 442918400.0,
+            "1171": 442918400.0,
+            "1172": 442918400.0,
+            "1173": 442918400.0,
+            "1174": 442918400.0,
+            "1175": 442918400.0,
+            "1176": 442918400.0,
+            "1177": 442918400.0,
+            "1178": 442918400.0,
+            "1179": 442918400.0,
+            "1180": 442918400.0,
+            "1181": 442918400.0,
+            "1182": 442918400.0,
+            "1183": 442918400.0,
+            "1184": 442918400.0,
+            "1185": 442918400.0,
+            "1186": 442918400.0,
+            "1187": 442918400.0,
+            "1188": 442918400.0,
+            "1189": 442918400.0,
+            "1190": 442918400.0,
+            "1191": 442918400.0,
+            "1192": 442918400.0,
+            "1193": 442918400.0,
+            "1194": 442918400.0,
+            "1195": 442918400.0,
+            "1196": 442918400.0,
+            "1197": 442918400.0,
+            "1198": 442918400.0,
+            "1199": 442918400.0,
+            "1200": 442918400.0,
+            "1201": 442918400.0,
+            "1202": 442918400.0,
+            "1203": 442918400.0,
+            "1204": 442918400.0,
+            "1205": 442918400.0,
+            "1206": 442918400.0,
+            "1207": 442918400.0,
+            "1208": 442918400.0,
+            "1209": 442918400.0,
+            "1210": 442918400.0,
+            "1211": 442918400.0,
+            "1212": 442918400.0,
+            "1213": 442918400.0,
+            "1214": 442918400.0,
+            "1215": 442918400.0,
+            "1216": 442918400.0,
+            "1217": 442918400.0,
+            "1218": 442918400.0,
+            "1219": 442918400.0,
+            "1220": 442918400.0,
+            "1221": 442918400.0,
+            "1222": 442918400.0,
+            "1223": 442918400.0,
+            "1224": 442918400.0,
+            "1225": 442918400.0,
+            "1226": 442918400.0,
+            "1227": 442918400.0,
+            "1228": 442918400.0,
+            "1229": 442918400.0,
+            "1230": 442918400.0,
+            "1231": 442918400.0,
+            "1232": 442918400.0,
+            "1233": 442918400.0,
+            "1234": 442918400.0,
+            "1235": 442918400.0,
+            "1236": 442918400.0,
+            "1237": 442918400.0,
+            "1238": 442918400.0,
+            "1239": 442918400.0,
+            "1240": 442918400.0,
+            "1241": 442918400.0,
+            "1242": 442918400.0,
+            "1243": 442918400.0,
+            "1244": 442918400.0,
+            "1245": 442918400.0,
+            "1246": 442918400.0,
+            "1247": 442918400.0,
+            "1248": 442918400.0,
+            "1249": 442918400.0,
+            "1250": 442918400.0,
+            "1251": 442918400.0,
+            "1252": 442918400.0,
+            "1253": 442918400.0,
+            "1254": 442918400.0,
+            "1255": 442918400.0,
+            "1256": 442918400.0,
+            "1257": 442918400.0,
+            "1258": 442918400.0,
+            "1259": 442918400.0,
+            "1260": 442918400.0,
+            "1261": 442918400.0,
+            "1262": 442918400.0,
+            "1263": 442918400.0,
+            "1264": 442918400.0,
+            "1265": 442918400.0,
+            "1266": 442918400.0,
+            "1267": 442918400.0,
+            "1268": 442918400.0,
+            "1269": 442918400.0,
+            "1270": 442918400.0,
+            "1271": 442918400.0,
+            "1272": 442918400.0,
+            "1273": 442918400.0,
+            "1274": 442918400.0,
+            "1275": 442918400.0,
+            "1276": 442918400.0,
+            "1277": 442918400.0,
+            "1278": 442918400.0,
+            "1279": 442918400.0,
+            "1280": 442918400.0,
+            "1281": 442918400.0,
+            "1282": 442918400.0,
+            "1283": 442918400.0,
+            "1284": 442918400.0,
+            "1285": 442918400.0,
+            "1286": 442918400.0,
+            "1287": 442918400.0,
+            "1288": 442918400.0,
+            "1289": 442918400.0,
+            "1290": 442918400.0,
+            "1291": 442918400.0,
+            "1292": 442918400.0,
+            "1293": 442918400.0,
+            "1294": 442918400.0,
+            "1295": 442918400.0,
+            "1296": 442918400.0,
+            "1297": 442918400.0,
+            "1298": 442918400.0,
+            "1299": 442918400.0,
+            "1300": 442918400.0,
+            "1301": 442918400.0,
+            "1302": 442918400.0,
+            "1303": 442918400.0,
+            "1304": 442918400.0,
+            "1305": 442918400.0,
+            "1306": 442918400.0,
+            "1307": 442918400.0,
+            "1308": 442918400.0,
+            "1309": 442918400.0,
+            "1310": 442918400.0,
+            "1311": 442918400.0,
+            "1312": 442918400.0,
+            "1313": 442918400.0,
+            "1314": 442918400.0,
+            "1315": 442918400.0,
+            "1316": 442918400.0,
+            "1317": 442918400.0,
+            "1318": 442918400.0,
+            "1319": 442918400.0,
+            "1320": 442918400.0,
+            "1321": 442918400.0,
+            "1322": 442918400.0,
+            "1323": 442918400.0,
+            "1324": 442918400.0,
+            "1325": 442918400.0,
+            "1326": 442918400.0,
+            "1327": 442918400.0,
+            "1328": 442918400.0,
+            "1329": 442918400.0,
+            "1330": 442918400.0,
+            "1331": 442918400.0,
+            "1332": 442918400.0,
+            "1333": 442918400.0,
+            "1334": 442918400.0,
+            "1335": 442918400.0,
+            "1336": 442918400.0,
+            "1337": 442918400.0,
+            "1338": 442918400.0,
+            "1339": 442918400.0,
+            "1340": 442918400.0,
+            "1341": 442918400.0,
+            "1342": 442918400.0,
+            "1343": 442918400.0,
+            "1344": 442918400.0,
+            "1345": 442918400.0,
+            "1346": 442918400.0,
+            "1347": 442918400.0,
+            "1348": 442918400.0,
+            "1349": 442918400.0,
+            "1350": 442918400.0,
+            "1351": 442918400.0,
+            "1352": 442918400.0,
+            "1353": 442918400.0,
+            "1354": 442918400.0,
+            "1355": 442918400.0,
+            "1356": 442918400.0,
+            "1357": 442918400.0,
+            "1358": 442918400.0,
+            "1359": 442918400.0,
+            "1360": 442918400.0,
+            "1361": 442918400.0,
+            "1362": 442918400.0,
+            "1363": 442918400.0,
+            "1364": 442918400.0,
+            "1365": 442918400.0,
+            "1366": 442918400.0,
+            "1367": 442918400.0,
+            "1368": 442918400.0,
+            "1369": 442918400.0,
+            "1370": 442918400.0,
+            "1371": 442918400.0,
+            "1372": 442918400.0,
+            "1373": 442918400.0,
+            "1374": 442918400.0,
+            "1375": 442918400.0,
+            "1376": 442918400.0,
+            "1377": 442918400.0,
+            "1378": 442918400.0,
+            "1379": 442918400.0,
+            "1380": 442918400.0,
+            "1381": 442918400.0,
+            "1382": 442918400.0,
+            "1383": 442918400.0,
+            "1384": 442918400.0,
+            "1385": 442918400.0,
+            "1386": 442918400.0,
+            "1387": 442918400.0,
+            "1388": 442918400.0,
+            "1389": 442918400.0,
+            "1390": 442918400.0,
+            "1391": 442918400.0,
+            "1392": 442918400.0,
+            "1393": 442918400.0,
+            "1394": 442918400.0,
+            "1395": 442918400.0,
+            "1396": 442918400.0,
+            "1397": 442918400.0,
+            "1398": 442918400.0,
+            "1399": 442918400.0,
+            "1400": 442918400.0,
+            "1401": 442918400.0,
+            "1402": 442918400.0,
+            "1403": 442918400.0,
+            "1404": 442918400.0,
+            "1405": 442918400.0,
+            "1406": 442918400.0,
+            "1407": 442918400.0,
+            "1408": 442918400.0,
+            "1409": 442918400.0,
+            "1410": 442918400.0,
+            "1411": 442918400.0,
+            "1412": 442918400.0,
+            "1413": 442918400.0,
+            "1414": 442918400.0,
+            "1415": 442918400.0,
+            "1416": 442918400.0,
+            "1417": 442918400.0,
+            "1418": 442918400.0,
+            "1419": 442918400.0,
+            "1420": 442918400.0,
+            "1421": 442918400.0,
+            "1422": 442918400.0,
+            "1423": 442918400.0,
+            "1424": 442918400.0,
+            "1425": 442918400.0,
+            "1426": 442918400.0,
+            "1427": 442918400.0,
+            "1428": 442918400.0,
+            "1429": 442918400.0,
+            "1430": 442918400.0,
+            "1431": 442918400.0,
+            "1432": 442918400.0,
+            "1433": 442918400.0,
+            "1434": 442918400.0,
+            "1435": 442918400.0,
+            "1436": 442918400.0,
+            "1437": 442918400.0,
+            "1438": 442918400.0,
+            "1439": 442918400.0,
+            "1440": 442918400.0,
+            "1441": 442918400.0,
+            "1442": 442918400.0,
+            "1443": 442918400.0,
+            "1444": 442918400.0,
+            "1445": 442918400.0,
+            "1446": 442918400.0,
+            "1447": 442918400.0,
+            "1448": 442918400.0,
+            "1449": 442918400.0,
+            "1450": 442918400.0,
+            "1451": 442918400.0,
+            "1452": 442918400.0,
+            "1453": 442918400.0,
+            "1454": 442918400.0,
+            "1455": 442918400.0,
+            "1456": 442918400.0,
+            "1457": 442918400.0,
+            "1458": 442918400.0,
+            "1459": 442918400.0,
+            "1460": 442918400.0,
+            "1461": 442918400.0,
+            "1462": 442918400.0,
+            "1463": 442918400.0,
+            "1464": 442918400.0,
+            "1465": 442918400.0,
+            "1466": 442918400.0,
+            "1467": 442918400.0,
+            "1468": 442918400.0,
+            "1469": 442918400.0,
+            "1470": 442918400.0,
+            "1471": 442918400.0,
+            "1472": 442918400.0,
+            "1473": 442918400.0,
+            "1474": 442918400.0,
+            "1475": 442918400.0,
+            "1476": 442918400.0,
+            "1477": 442918400.0,
+            "1478": 442918400.0,
+            "1479": 442918400.0,
+            "1480": 442918400.0,
+            "1481": 442918400.0,
+            "1482": 442918400.0,
+            "1483": 442918400.0,
+            "1484": 442918400.0,
+            "1485": 442918400.0,
+            "1486": 442918400.0,
+            "1487": 442918400.0,
+            "1488": 442918400.0,
+            "1489": 442918400.0,
+            "1490": 442918400.0,
+            "1491": 442918400.0,
+            "1492": 442918400.0,
+            "1493": 442918400.0,
+            "1494": 442918400.0,
+            "1495": 442918400.0,
+            "1496": 442918400.0,
+            "1497": 442918400.0,
+            "1498": 442918400.0,
+            "1499": 442918400.0,
+            "1500": 442918400.0,
+            "1501": 442918400.0,
+            "1502": 442918400.0,
+            "1503": 442918400.0,
+            "1504": 442918400.0,
+            "1505": 442918400.0,
+            "1506": 442918400.0,
+            "1507": 442918400.0,
+            "1508": 442918400.0,
+            "1509": 442918400.0,
+            "1510": 442918400.0,
+            "1511": 442918400.0,
+            "1512": 442918400.0,
+            "1513": 442918400.0,
+            "1514": 442918400.0,
+            "1515": 442918400.0,
+            "1516": 442918400.0,
+            "1517": 442918400.0,
+            "1518": 442918400.0,
+            "1519": 442918400.0,
+            "1520": 442918400.0,
+            "1521": 442918400.0,
+            "1522": 442918400.0,
+            "1523": 442918400.0,
+            "1524": 442918400.0,
+            "1525": 442918400.0,
+            "1526": 442918400.0,
+            "1527": 442918400.0,
+            "1528": 442918400.0,
+            "1529": 442918400.0,
+            "1530": 442918400.0,
+            "1531": 442918400.0,
+            "1532": 442918400.0,
+            "1533": 442918400.0,
+            "1534": 442918400.0,
+            "1535": 442918400.0,
+            "1536": 442918400.0,
+            "1537": 442918400.0,
+            "1538": 442918400.0,
+            "1539": 442918400.0,
+            "1540": 442918400.0,
+            "1541": 442918400.0,
+            "1542": 442918400.0,
+            "1543": 442918400.0,
+            "1544": 442918400.0,
+            "1545": 442918400.0,
+            "1546": 442918400.0,
+            "1547": 442918400.0,
+            "1548": 442918400.0,
+            "1549": 442918400.0,
+            "1550": 442918400.0,
+            "1551": 442918400.0,
+            "1552": 442918400.0,
+            "1553": 442918400.0,
+            "1554": 442918400.0,
+            "1555": 442918400.0,
+            "1556": 442918400.0,
+            "1557": 442918400.0,
+            "1558": 442918400.0,
+            "1559": 442918400.0,
+            "1560": 442918400.0,
+            "1561": 442918400.0,
+            "1562": 442918400.0,
+            "1563": 442918400.0,
+            "1564": 442918400.0,
+            "1565": 442918400.0,
+            "1566": 442918400.0,
+            "1567": 442918400.0,
+            "1568": 442918400.0,
+            "1569": 442918400.0,
+            "1570": 442918400.0,
+            "1571": 442918400.0,
+            "1572": 442918400.0,
+            "1573": 442918400.0,
+            "1574": 442918400.0,
+            "1575": 442918400.0,
+            "1576": 442918400.0,
+            "1577": 442918400.0,
+            "1578": 442918400.0,
+            "1579": 442918400.0,
+            "1580": 442918400.0,
+            "1581": 442918400.0,
+            "1582": 442918400.0,
+            "1583": 442918400.0,
+            "1584": 442918400.0,
+            "1585": 442918400.0,
+            "1586": 442918400.0,
+            "1587": 442918400.0,
+            "1588": 442918400.0,
+            "1589": 442918400.0,
+            "1590": 442918400.0,
+            "1591": 442918400.0,
+            "1592": 442918400.0,
+            "1593": 442918400.0,
+            "1594": 442918400.0,
+            "1595": 442918400.0,
+            "1596": 442918400.0,
+            "1597": 442918400.0,
+            "1598": 442918400.0,
+            "1599": 442918400.0,
+            "1600": 442918400.0,
+            "1601": 442918400.0,
+            "1602": 442918400.0,
+            "1603": 442918400.0,
+            "1604": 442918400.0,
+            "1605": 442918400.0,
+            "1606": 442918400.0,
+            "1607": 442918400.0,
+            "1608": 442918400.0,
+            "1609": 442918400.0,
+            "1610": 442918400.0,
+            "1611": 442918400.0,
+            "1612": 442918400.0,
+            "1613": 442918400.0,
+            "1614": 442918400.0,
+            "1615": 442918400.0,
+            "1616": 442918400.0,
+            "1617": 442918400.0,
+            "1618": 442918400.0,
+            "1619": 442918400.0,
+            "1620": 442918400.0,
+            "1621": 442918400.0,
+            "1622": 442918400.0,
+            "1623": 442918400.0,
+            "1624": 442918400.0,
+            "1625": 442918400.0,
+            "1626": 442918400.0,
+            "1627": 442918400.0,
+            "1628": 442918400.0,
+            "1629": 442918400.0,
+            "1630": 442918400.0,
+            "1631": 442918400.0,
+            "1632": 442918400.0,
+            "1633": 442918400.0,
+            "1634": 442918400.0,
+            "1635": 442918400.0,
+            "1636": 442918400.0,
+            "1637": 442918400.0,
+            "1638": 442918400.0,
+            "1639": 442918400.0,
+            "1640": 442918400.0,
+            "1641": 442918400.0,
+            "1642": 442918400.0,
+            "1643": 442918400.0,
+            "1644": 442918400.0,
+            "1645": 442918400.0,
+            "1646": 442918400.0,
+            "1647": 442918400.0,
+            "1648": 442918400.0,
+            "1649": 442918400.0,
+            "1650": 442918400.0,
+            "1651": 442918400.0,
+            "1652": 442918400.0,
+            "1653": 442918400.0,
+            "1654": 442918400.0,
+            "1655": 442918400.0,
+            "1656": 442918400.0,
+            "1657": 442918400.0,
+            "1658": 442918400.0,
+            "1659": 442918400.0,
+            "1660": 442918400.0,
+            "1661": 442918400.0,
+            "1662": 442918400.0,
+            "1663": 442918400.0,
+            "1664": 442918400.0,
+            "1665": 442918400.0,
+            "1666": 442918400.0,
+            "1667": 442918400.0,
+            "1668": 442918400.0,
+            "1669": 442918400.0,
+            "1670": 442918400.0,
+            "1671": 442918400.0,
+            "1672": 442918400.0,
+            "1673": 442918400.0,
+            "1674": 442918400.0,
+            "1675": 442918400.0,
+            "1676": 442918400.0,
+            "1677": 442918400.0,
+            "1678": 442918400.0,
+            "1679": 442918400.0,
+            "1680": 442918400.0,
+            "1681": 442918400.0,
+            "1682": 442918400.0,
+            "1683": 442918400.0,
+            "1684": 442918400.0,
+            "1685": 442918400.0,
+            "1686": 442918400.0,
+            "1687": 442918400.0,
+            "1688": 442918400.0,
+            "1689": 442918400.0,
+            "1690": 442918400.0,
+            "1691": 442918400.0,
+            "1692": 442918400.0,
+            "1693": 442918400.0,
+            "1694": 442918400.0,
+            "1695": 442918400.0,
+            "1696": 442918400.0,
+            "1697": 442918400.0,
+            "1698": 442918400.0,
+            "1699": 442918400.0,
+            "1700": 442918400.0,
+            "1701": 442918400.0,
+            "1702": 442918400.0,
+            "1703": 442918400.0,
+            "1704": 442918400.0,
+            "1705": 442918400.0,
+            "1706": 442918400.0,
+            "1707": 442918400.0,
+            "1708": 442918400.0,
+            "1709": 442918400.0,
+            "1710": 442918400.0,
+            "1711": 442918400.0,
+            "1712": 442918400.0,
+            "1713": 442918400.0,
+            "1714": 442918400.0,
+            "1715": 442918400.0,
+            "1716": 442918400.0,
+            "1717": 442918400.0,
+            "1718": 442918400.0,
+            "1719": 442918400.0,
+            "1720": 442918400.0,
+            "1721": 442918400.0,
+            "1722": 442918400.0,
+            "1723": 442918400.0,
+            "1724": 442918400.0,
+            "1725": 442918400.0,
+            "1726": 442918400.0,
+            "1727": 442918400.0,
+            "1728": 442918400.0,
+            "1729": 442918400.0,
+            "1730": 442918400.0,
+            "1731": 442918400.0,
+            "1732": 442918400.0,
+            "1733": 442918400.0,
+            "1734": 442918400.0,
+            "1735": 442918400.0,
+            "1736": 442918400.0,
+            "1737": 442918400.0,
+            "1738": 442918400.0,
+            "1739": 442918400.0,
+            "1740": 442918400.0,
+            "1741": 442918400.0,
+            "1742": 442918400.0,
+            "1743": 442918400.0,
+            "1744": 442918400.0,
+            "1745": 442918400.0,
+            "1746": 442918400.0,
+            "1747": 442918400.0,
+            "1748": 442918400.0,
+            "1749": 442918400.0,
+            "1750": 442918400.0,
+            "1751": 442918400.0,
+            "1752": 442918400.0,
+            "1753": 442918400.0,
+            "1754": 442918400.0,
+            "1755": 442918400.0,
+            "1756": 442918400.0,
+            "1757": 442918400.0,
+            "1758": 442918400.0,
+            "1759": 442918400.0,
+            "1760": 442918400.0,
+            "1761": 442918400.0,
+            "1762": 442918400.0,
+            "1763": 442918400.0,
+            "1764": 442918400.0,
+            "1765": 442918400.0,
+            "1766": 442918400.0,
+            "1767": 442918400.0,
+            "1768": 442918400.0,
+            "1769": 442918400.0,
+            "1770": 442918400.0,
+            "1771": 442918400.0,
+            "1772": 442918400.0,
+            "1773": 442918400.0,
+            "1774": 442918400.0,
+            "1775": 442918400.0,
+            "1776": 442918400.0,
+            "1777": 442918400.0,
+            "1778": 442918400.0,
+            "1779": 442918400.0,
+            "1780": 442918400.0,
+            "1781": 442918400.0,
+            "1782": 442918400.0,
+            "1783": 442918400.0,
+            "1784": 442918400.0,
+            "1785": 442918400.0,
+            "1786": 442918400.0,
+            "1787": 442918400.0,
+            "1788": 442918400.0,
+            "1789": 442918400.0,
+            "1790": 442918400.0,
+            "1791": 442918400.0,
+            "1792": 442918400.0,
+            "1793": 442918400.0,
+            "1794": 442918400.0,
+            "1795": 442918400.0,
+            "1796": 442918400.0,
+            "1797": 442918400.0,
+            "1798": 442918400.0,
+            "1799": 442918400.0,
+            "1800": 442918400.0,
+            "1801": 442918400.0,
+            "1802": 442918400.0,
+            "1803": 442918400.0,
+            "1804": 442918400.0,
+            "1805": 442918400.0,
+            "1806": 442918400.0,
+            "1807": 442918400.0,
+            "1808": 442918400.0,
+            "1809": 442918400.0,
+            "1810": 442918400.0,
+            "1811": 442918400.0,
+            "1812": 442918400.0,
+            "1813": 442918400.0,
+            "1814": 442918400.0,
+            "1815": 442918400.0,
+            "1816": 442918400.0,
+            "1817": 442918400.0,
+            "1818": 442918400.0,
+            "1819": 442918400.0,
+            "1820": 442918400.0,
+            "1821": 442918400.0,
+            "1822": 442918400.0,
+            "1823": 442918400.0,
+            "1824": 442918400.0,
+            "1825": 442918400.0,
+            "1826": 442918400.0,
+            "1827": 442918400.0,
+            "1828": 442918400.0,
+            "1829": 442918400.0,
+            "1830": 442918400.0,
+            "1831": 442918400.0,
+            "1832": 442918400.0,
+            "1833": 442918400.0,
+            "1834": 442918400.0,
+            "1835": 442918400.0,
+            "1836": 442918400.0,
+            "1837": 442918400.0,
+            "1838": 442918400.0,
+            "1839": 442918400.0,
+            "1840": 442918400.0,
+            "1841": 442918400.0,
+            "1842": 442918400.0,
+            "1843": 442918400.0,
+            "1844": 442918400.0,
+            "1845": 442918400.0,
+            "1846": 442918400.0,
+            "1847": 442918400.0,
+            "1848": 442918400.0,
+            "1849": 442918400.0,
+            "1850": 442918400.0,
+            "1851": 442918400.0,
+            "1852": 442918400.0,
+            "1853": 442918400.0,
+            "1854": 442918400.0,
+            "1855": 442918400.0,
+            "1856": 442918400.0,
+            "1857": 442918400.0,
+            "1858": 442918400.0,
+            "1859": 442918400.0,
+            "1860": 442918400.0,
+            "1861": 442918400.0,
+            "1862": 442918400.0,
+            "1863": 442918400.0,
+            "1864": 442918400.0,
+            "1865": 442918400.0,
+            "1866": 442918400.0,
+            "1867": 442918400.0,
+            "1868": 442918400.0,
+            "1869": 442918400.0,
+            "1870": 442918400.0,
+            "1871": 442918400.0,
+            "1872": 442918400.0,
+            "1873": 442918400.0,
+            "1874": 442918400.0,
+            "1875": 442918400.0,
+            "1876": 442918400.0,
+            "1877": 442918400.0,
+            "1878": 442918400.0,
+            "1879": 442918400.0,
+            "1880": 442918400.0,
+            "1881": 442918400.0,
+            "1882": 442918400.0,
+            "1883": 442918400.0,
+            "1884": 442918400.0,
+            "1885": 442918400.0,
+            "1886": 442918400.0,
+            "1887": 442918400.0,
+            "1888": 442918400.0,
+            "1889": 442918400.0,
+            "1890": 442918400.0,
+            "1891": 442918400.0,
+            "1892": 442918400.0,
+            "1893": 442918400.0,
+            "1894": 442918400.0,
+            "1895": 442918400.0,
+            "1896": 442918400.0,
+            "1897": 442918400.0,
+            "1898": 442918400.0,
+            "1899": 442918400.0,
+            "1900": 442918400.0,
+            "1901": 442918400.0,
+            "1902": 442918400.0,
+            "1903": 442918400.0,
+            "1904": 442918400.0,
+            "1905": 442918400.0,
+            "1906": 442918400.0,
+            "1907": 442918400.0,
+            "1908": 442918400.0,
+            "1909": 442918400.0,
+            "1910": 442918400.0,
+            "1911": 442918400.0,
+            "1912": 442918400.0,
+            "1913": 442918400.0,
+            "1914": 442918400.0,
+            "1915": 442918400.0,
+            "1916": 442918400.0,
+            "1917": 442918400.0,
+            "1918": 442918400.0,
+            "1919": 442918400.0,
+            "1920": 442918400.0,
+            "1921": 442918400.0,
+            "1922": 442918400.0,
+            "1923": 442918400.0,
+            "1924": 442918400.0,
+            "1925": 442918400.0,
+            "1926": 442918400.0,
+            "1927": 442918400.0,
+            "1928": 442918400.0,
+            "1929": 442918400.0,
+            "1930": 442918400.0,
+            "1931": 442918400.0,
+            "1932": 442918400.0,
+            "1933": 442918400.0,
+            "1934": 442918400.0,
+            "1935": 442918400.0,
+            "1936": 442918400.0,
+            "1937": 442918400.0,
+            "1938": 442918400.0,
+            "1939": 442918400.0,
+            "1940": 442918400.0,
+            "1941": 442918400.0,
+            "1942": 442918400.0,
+            "1943": 442918400.0,
+            "1944": 442918400.0,
+            "1945": 442918400.0,
+            "1946": 442918400.0,
+            "1947": 442918400.0,
+            "1948": 442918400.0,
+            "1949": 442918400.0,
+            "1950": 442918400.0,
+            "1951": 442918400.0,
+            "1952": 442918400.0,
+            "1953": 442918400.0,
+            "1954": 442918400.0,
+            "1955": 442918400.0,
+            "1956": 442918400.0,
+            "1957": 442918400.0,
+            "1958": 442918400.0,
+            "1959": 442918400.0,
+            "1960": 442918400.0,
+            "1961": 442918400.0,
+            "1962": 442918400.0,
+            "1963": 442918400.0,
+            "1964": 442918400.0,
+            "1965": 442918400.0,
+            "1966": 442918400.0,
+            "1967": 442918400.0,
+            "1968": 442918400.0,
+            "1969": 442918400.0,
+            "1970": 442918400.0,
+            "1971": 442918400.0,
+            "1972": 442918400.0,
+            "1973": 442918400.0,
+            "1974": 442918400.0,
+            "1975": 442918400.0,
+            "1976": 442918400.0,
+            "1977": 442918400.0,
+            "1978": 442918400.0,
+            "1979": 442918400.0,
+            "1980": 442918400.0,
+            "1981": 442918400.0,
+            "1982": 442918400.0,
+            "1983": 442918400.0,
+            "1984": 442918400.0,
+            "1985": 442918400.0,
+            "1986": 442918400.0,
+            "1987": 442918400.0,
+            "1988": 442918400.0,
+            "1989": 442918400.0,
+            "1990": 442918400.0,
+            "1991": 442918400.0,
+            "1992": 442918400.0,
+            "1993": 442918400.0,
+            "1994": 442918400.0,
+            "1995": 442918400.0,
+            "1996": 442918400.0,
+            "1997": 442918400.0,
+            "1998": 442918400.0,
+            "1999": 442918400.0,
+            "2000": 442918400.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 761183744.0,
+            "2": 849621504.0,
+            "3": 849621504.0,
+            "4": 849621504.0,
+            "5": 849621504.0,
+            "6": 849621504.0,
+            "7": 849621504.0,
+            "8": 849621504.0,
+            "9": 849621504.0,
+            "10": 849621504.0,
+            "11": 849621504.0,
+            "12": 849621504.0,
+            "13": 849621504.0,
+            "14": 849621504.0,
+            "15": 849621504.0,
+            "16": 849621504.0,
+            "17": 849621504.0,
+            "18": 849621504.0,
+            "19": 849621504.0,
+            "20": 849621504.0,
+            "21": 849621504.0,
+            "22": 849621504.0,
+            "23": 849621504.0,
+            "24": 849621504.0,
+            "25": 849621504.0,
+            "26": 849621504.0,
+            "27": 849621504.0,
+            "28": 849621504.0,
+            "29": 849621504.0,
+            "30": 849621504.0,
+            "31": 849621504.0,
+            "32": 849621504.0,
+            "33": 849621504.0,
+            "34": 849621504.0,
+            "35": 849621504.0,
+            "36": 849621504.0,
+            "37": 849621504.0,
+            "38": 849621504.0,
+            "39": 849621504.0,
+            "40": 849621504.0,
+            "41": 849621504.0,
+            "42": 849621504.0,
+            "43": 849621504.0,
+            "44": 849621504.0,
+            "45": 849621504.0,
+            "46": 849621504.0,
+            "47": 849621504.0,
+            "48": 849621504.0,
+            "49": 849621504.0,
+            "50": 849621504.0,
+            "51": 849621504.0,
+            "52": 849621504.0,
+            "53": 849621504.0,
+            "54": 849621504.0,
+            "55": 849621504.0,
+            "56": 849621504.0,
+            "57": 849621504.0,
+            "58": 849621504.0,
+            "59": 849621504.0,
+            "60": 849621504.0,
+            "61": 849621504.0,
+            "62": 849621504.0,
+            "63": 849621504.0,
+            "64": 849621504.0,
+            "65": 849621504.0,
+            "66": 849621504.0,
+            "67": 849621504.0,
+            "68": 849621504.0,
+            "69": 849621504.0,
+            "70": 849621504.0,
+            "71": 849621504.0,
+            "72": 849621504.0,
+            "73": 849621504.0,
+            "74": 849621504.0,
+            "75": 849621504.0,
+            "76": 849621504.0,
+            "77": 849621504.0,
+            "78": 849621504.0,
+            "79": 849621504.0,
+            "80": 849621504.0,
+            "81": 849621504.0,
+            "82": 849621504.0,
+            "83": 849621504.0,
+            "84": 849621504.0,
+            "85": 849621504.0,
+            "86": 849621504.0,
+            "87": 849621504.0,
+            "88": 849621504.0,
+            "89": 849621504.0,
+            "90": 849621504.0,
+            "91": 849621504.0,
+            "92": 849621504.0,
+            "93": 849621504.0,
+            "94": 849621504.0,
+            "95": 849621504.0,
+            "96": 849621504.0,
+            "97": 849621504.0,
+            "98": 849621504.0,
+            "99": 849621504.0,
+            "100": 849621504.0,
+            "101": 849621504.0,
+            "102": 849621504.0,
+            "103": 849621504.0,
+            "104": 849621504.0,
+            "105": 849621504.0,
+            "106": 849621504.0,
+            "107": 849621504.0,
+            "108": 849621504.0,
+            "109": 849621504.0,
+            "110": 849621504.0,
+            "111": 849621504.0,
+            "112": 849621504.0,
+            "113": 849621504.0,
+            "114": 849621504.0,
+            "115": 849621504.0,
+            "116": 849621504.0,
+            "117": 849621504.0,
+            "118": 849621504.0,
+            "119": 849621504.0,
+            "120": 849621504.0,
+            "121": 849621504.0,
+            "122": 849621504.0,
+            "123": 849621504.0,
+            "124": 849621504.0,
+            "125": 849621504.0,
+            "126": 849621504.0,
+            "127": 849621504.0,
+            "128": 849621504.0,
+            "129": 849621504.0,
+            "130": 849621504.0,
+            "131": 849621504.0,
+            "132": 849621504.0,
+            "133": 849621504.0,
+            "134": 849621504.0,
+            "135": 849621504.0,
+            "136": 849621504.0,
+            "137": 849621504.0,
+            "138": 849621504.0,
+            "139": 849621504.0,
+            "140": 849621504.0,
+            "141": 849621504.0,
+            "142": 849621504.0,
+            "143": 849621504.0,
+            "144": 849621504.0,
+            "145": 849621504.0,
+            "146": 849621504.0,
+            "147": 849621504.0,
+            "148": 849621504.0,
+            "149": 849621504.0,
+            "150": 849621504.0,
+            "151": 849621504.0,
+            "152": 849621504.0,
+            "153": 849621504.0,
+            "154": 849621504.0,
+            "155": 849621504.0,
+            "156": 849621504.0,
+            "157": 849621504.0,
+            "158": 849621504.0,
+            "159": 849621504.0,
+            "160": 849621504.0,
+            "161": 849621504.0,
+            "162": 849621504.0,
+            "163": 849621504.0,
+            "164": 849621504.0,
+            "165": 849621504.0,
+            "166": 849621504.0,
+            "167": 849621504.0,
+            "168": 849621504.0,
+            "169": 849621504.0,
+            "170": 849621504.0,
+            "171": 849621504.0,
+            "172": 849621504.0,
+            "173": 849621504.0,
+            "174": 849621504.0,
+            "175": 849621504.0,
+            "176": 849621504.0,
+            "177": 849621504.0,
+            "178": 849621504.0,
+            "179": 849621504.0,
+            "180": 849621504.0,
+            "181": 849621504.0,
+            "182": 849621504.0,
+            "183": 849621504.0,
+            "184": 849621504.0,
+            "185": 849621504.0,
+            "186": 849621504.0,
+            "187": 849621504.0,
+            "188": 849621504.0,
+            "189": 849621504.0,
+            "190": 849621504.0,
+            "191": 849621504.0,
+            "192": 849621504.0,
+            "193": 849621504.0,
+            "194": 849621504.0,
+            "195": 849621504.0,
+            "196": 849621504.0,
+            "197": 849621504.0,
+            "198": 849621504.0,
+            "199": 849621504.0,
+            "200": 849621504.0,
+            "201": 849621504.0,
+            "202": 849621504.0,
+            "203": 849621504.0,
+            "204": 849621504.0,
+            "205": 849621504.0,
+            "206": 849621504.0,
+            "207": 849621504.0,
+            "208": 849621504.0,
+            "209": 849621504.0,
+            "210": 849621504.0,
+            "211": 849621504.0,
+            "212": 849621504.0,
+            "213": 849621504.0,
+            "214": 849621504.0,
+            "215": 849621504.0,
+            "216": 849621504.0,
+            "217": 849621504.0,
+            "218": 849621504.0,
+            "219": 849621504.0,
+            "220": 849621504.0,
+            "221": 849621504.0,
+            "222": 849621504.0,
+            "223": 849621504.0,
+            "224": 849621504.0,
+            "225": 849621504.0,
+            "226": 849621504.0,
+            "227": 849621504.0,
+            "228": 849621504.0,
+            "229": 849621504.0,
+            "230": 849621504.0,
+            "231": 849621504.0,
+            "232": 849621504.0,
+            "233": 849621504.0,
+            "234": 849621504.0,
+            "235": 849621504.0,
+            "236": 849621504.0,
+            "237": 849621504.0,
+            "238": 849621504.0,
+            "239": 849621504.0,
+            "240": 849621504.0,
+            "241": 849621504.0,
+            "242": 849621504.0,
+            "243": 849621504.0,
+            "244": 849621504.0,
+            "245": 849621504.0,
+            "246": 849621504.0,
+            "247": 849621504.0,
+            "248": 849621504.0,
+            "249": 849621504.0,
+            "250": 849621504.0,
+            "251": 849621504.0,
+            "252": 849621504.0,
+            "253": 849621504.0,
+            "254": 849621504.0,
+            "255": 849621504.0,
+            "256": 849621504.0,
+            "257": 849621504.0,
+            "258": 849621504.0,
+            "259": 849621504.0,
+            "260": 849621504.0,
+            "261": 849621504.0,
+            "262": 849621504.0,
+            "263": 849621504.0,
+            "264": 849621504.0,
+            "265": 849621504.0,
+            "266": 849621504.0,
+            "267": 849621504.0,
+            "268": 849621504.0,
+            "269": 849621504.0,
+            "270": 849621504.0,
+            "271": 849621504.0,
+            "272": 849621504.0,
+            "273": 849621504.0,
+            "274": 849621504.0,
+            "275": 849621504.0,
+            "276": 849621504.0,
+            "277": 849621504.0,
+            "278": 849621504.0,
+            "279": 849621504.0,
+            "280": 849621504.0,
+            "281": 849621504.0,
+            "282": 849621504.0,
+            "283": 849621504.0,
+            "284": 849621504.0,
+            "285": 849621504.0,
+            "286": 849621504.0,
+            "287": 849621504.0,
+            "288": 849621504.0,
+            "289": 849621504.0,
+            "290": 849621504.0,
+            "291": 849621504.0,
+            "292": 849621504.0,
+            "293": 849621504.0,
+            "294": 849621504.0,
+            "295": 849621504.0,
+            "296": 849621504.0,
+            "297": 849621504.0,
+            "298": 849621504.0,
+            "299": 849621504.0,
+            "300": 849621504.0,
+            "301": 849621504.0,
+            "302": 849621504.0,
+            "303": 849621504.0,
+            "304": 849621504.0,
+            "305": 849621504.0,
+            "306": 849621504.0,
+            "307": 849621504.0,
+            "308": 849621504.0,
+            "309": 849621504.0,
+            "310": 849621504.0,
+            "311": 849621504.0,
+            "312": 849621504.0,
+            "313": 849621504.0,
+            "314": 849621504.0,
+            "315": 849621504.0,
+            "316": 849621504.0,
+            "317": 849621504.0,
+            "318": 849621504.0,
+            "319": 849621504.0,
+            "320": 849621504.0,
+            "321": 849621504.0,
+            "322": 849621504.0,
+            "323": 849621504.0,
+            "324": 849621504.0,
+            "325": 849621504.0,
+            "326": 849621504.0,
+            "327": 849621504.0,
+            "328": 849621504.0,
+            "329": 849621504.0,
+            "330": 849621504.0,
+            "331": 849621504.0,
+            "332": 849621504.0,
+            "333": 849621504.0,
+            "334": 849621504.0,
+            "335": 849621504.0,
+            "336": 849621504.0,
+            "337": 849621504.0,
+            "338": 849621504.0,
+            "339": 849621504.0,
+            "340": 849621504.0,
+            "341": 849621504.0,
+            "342": 849621504.0,
+            "343": 849621504.0,
+            "344": 849621504.0,
+            "345": 849621504.0,
+            "346": 849621504.0,
+            "347": 849621504.0,
+            "348": 849621504.0,
+            "349": 849621504.0,
+            "350": 849621504.0,
+            "351": 849621504.0,
+            "352": 849621504.0,
+            "353": 849621504.0,
+            "354": 849621504.0,
+            "355": 849621504.0,
+            "356": 849621504.0,
+            "357": 849621504.0,
+            "358": 849621504.0,
+            "359": 849621504.0,
+            "360": 849621504.0,
+            "361": 849621504.0,
+            "362": 849621504.0,
+            "363": 849621504.0,
+            "364": 849621504.0,
+            "365": 849621504.0,
+            "366": 849621504.0,
+            "367": 849621504.0,
+            "368": 849621504.0,
+            "369": 849621504.0,
+            "370": 849621504.0,
+            "371": 849621504.0,
+            "372": 849621504.0,
+            "373": 849621504.0,
+            "374": 849621504.0,
+            "375": 849621504.0,
+            "376": 849621504.0,
+            "377": 849621504.0,
+            "378": 849621504.0,
+            "379": 849621504.0,
+            "380": 849621504.0,
+            "381": 849621504.0,
+            "382": 849621504.0,
+            "383": 849621504.0,
+            "384": 849621504.0,
+            "385": 849621504.0,
+            "386": 849621504.0,
+            "387": 849621504.0,
+            "388": 849621504.0,
+            "389": 849621504.0,
+            "390": 849621504.0,
+            "391": 849621504.0,
+            "392": 849621504.0,
+            "393": 849621504.0,
+            "394": 849621504.0,
+            "395": 849621504.0,
+            "396": 849621504.0,
+            "397": 849621504.0,
+            "398": 849621504.0,
+            "399": 849621504.0,
+            "400": 849621504.0,
+            "401": 849621504.0,
+            "402": 849621504.0,
+            "403": 849621504.0,
+            "404": 849621504.0,
+            "405": 849621504.0,
+            "406": 849621504.0,
+            "407": 849621504.0,
+            "408": 849621504.0,
+            "409": 849621504.0,
+            "410": 849621504.0,
+            "411": 849621504.0,
+            "412": 849621504.0,
+            "413": 849621504.0,
+            "414": 849621504.0,
+            "415": 849621504.0,
+            "416": 849621504.0,
+            "417": 849621504.0,
+            "418": 849621504.0,
+            "419": 849621504.0,
+            "420": 849621504.0,
+            "421": 849621504.0,
+            "422": 849621504.0,
+            "423": 849621504.0,
+            "424": 849621504.0,
+            "425": 849621504.0,
+            "426": 849621504.0,
+            "427": 849621504.0,
+            "428": 849621504.0,
+            "429": 849621504.0,
+            "430": 849621504.0,
+            "431": 849621504.0,
+            "432": 849621504.0,
+            "433": 849621504.0,
+            "434": 849621504.0,
+            "435": 849621504.0,
+            "436": 849621504.0,
+            "437": 849621504.0,
+            "438": 849621504.0,
+            "439": 849621504.0,
+            "440": 849621504.0,
+            "441": 849621504.0,
+            "442": 849621504.0,
+            "443": 849621504.0,
+            "444": 849621504.0,
+            "445": 849621504.0,
+            "446": 849621504.0,
+            "447": 849621504.0,
+            "448": 849621504.0,
+            "449": 849621504.0,
+            "450": 849621504.0,
+            "451": 849621504.0,
+            "452": 849621504.0,
+            "453": 849621504.0,
+            "454": 849621504.0,
+            "455": 849621504.0,
+            "456": 849621504.0,
+            "457": 849621504.0,
+            "458": 849621504.0,
+            "459": 849621504.0,
+            "460": 849621504.0,
+            "461": 849621504.0,
+            "462": 849621504.0,
+            "463": 849621504.0,
+            "464": 849621504.0,
+            "465": 849621504.0,
+            "466": 849621504.0,
+            "467": 849621504.0,
+            "468": 849621504.0,
+            "469": 849621504.0,
+            "470": 849621504.0,
+            "471": 849621504.0,
+            "472": 849621504.0,
+            "473": 849621504.0,
+            "474": 849621504.0,
+            "475": 849621504.0,
+            "476": 849621504.0,
+            "477": 849621504.0,
+            "478": 849621504.0,
+            "479": 849621504.0,
+            "480": 849621504.0,
+            "481": 849621504.0,
+            "482": 849621504.0,
+            "483": 849621504.0,
+            "484": 849621504.0,
+            "485": 849621504.0,
+            "486": 849621504.0,
+            "487": 849621504.0,
+            "488": 849621504.0,
+            "489": 849621504.0,
+            "490": 849621504.0,
+            "491": 849621504.0,
+            "492": 849621504.0,
+            "493": 849621504.0,
+            "494": 849621504.0,
+            "495": 849621504.0,
+            "496": 849621504.0,
+            "497": 849621504.0,
+            "498": 849621504.0,
+            "499": 849621504.0,
+            "500": 849621504.0,
+            "501": 849621504.0,
+            "502": 849621504.0,
+            "503": 849621504.0,
+            "504": 849621504.0,
+            "505": 849621504.0,
+            "506": 849621504.0,
+            "507": 849621504.0,
+            "508": 849621504.0,
+            "509": 849621504.0,
+            "510": 849621504.0,
+            "511": 849621504.0,
+            "512": 849621504.0,
+            "513": 849621504.0,
+            "514": 849621504.0,
+            "515": 849621504.0,
+            "516": 849621504.0,
+            "517": 849621504.0,
+            "518": 849621504.0,
+            "519": 849621504.0,
+            "520": 849621504.0,
+            "521": 849621504.0,
+            "522": 849621504.0,
+            "523": 849621504.0,
+            "524": 849621504.0,
+            "525": 849621504.0,
+            "526": 849621504.0,
+            "527": 849621504.0,
+            "528": 849621504.0,
+            "529": 849621504.0,
+            "530": 849621504.0,
+            "531": 849621504.0,
+            "532": 849621504.0,
+            "533": 849621504.0,
+            "534": 849621504.0,
+            "535": 849621504.0,
+            "536": 849621504.0,
+            "537": 849621504.0,
+            "538": 849621504.0,
+            "539": 849621504.0,
+            "540": 849621504.0,
+            "541": 849621504.0,
+            "542": 849621504.0,
+            "543": 849621504.0,
+            "544": 849621504.0,
+            "545": 849621504.0,
+            "546": 849621504.0,
+            "547": 849621504.0,
+            "548": 849621504.0,
+            "549": 849621504.0,
+            "550": 849621504.0,
+            "551": 849621504.0,
+            "552": 849621504.0,
+            "553": 849621504.0,
+            "554": 849621504.0,
+            "555": 849621504.0,
+            "556": 849621504.0,
+            "557": 849621504.0,
+            "558": 849621504.0,
+            "559": 849621504.0,
+            "560": 849621504.0,
+            "561": 849621504.0,
+            "562": 849621504.0,
+            "563": 849621504.0,
+            "564": 849621504.0,
+            "565": 849621504.0,
+            "566": 849621504.0,
+            "567": 849621504.0,
+            "568": 849621504.0,
+            "569": 849621504.0,
+            "570": 849621504.0,
+            "571": 849621504.0,
+            "572": 849621504.0,
+            "573": 849621504.0,
+            "574": 849621504.0,
+            "575": 849621504.0,
+            "576": 849621504.0,
+            "577": 849621504.0,
+            "578": 849621504.0,
+            "579": 849621504.0,
+            "580": 849621504.0,
+            "581": 849621504.0,
+            "582": 849621504.0,
+            "583": 849621504.0,
+            "584": 849621504.0,
+            "585": 849621504.0,
+            "586": 849621504.0,
+            "587": 849621504.0,
+            "588": 849621504.0,
+            "589": 849621504.0,
+            "590": 849621504.0,
+            "591": 849621504.0,
+            "592": 849621504.0,
+            "593": 849621504.0,
+            "594": 849621504.0,
+            "595": 849621504.0,
+            "596": 849621504.0,
+            "597": 849621504.0,
+            "598": 849621504.0,
+            "599": 849621504.0,
+            "600": 849621504.0,
+            "601": 849621504.0,
+            "602": 849621504.0,
+            "603": 849621504.0,
+            "604": 849621504.0,
+            "605": 849621504.0,
+            "606": 849621504.0,
+            "607": 849621504.0,
+            "608": 849621504.0,
+            "609": 849621504.0,
+            "610": 849621504.0,
+            "611": 849621504.0,
+            "612": 849621504.0,
+            "613": 849621504.0,
+            "614": 849621504.0,
+            "615": 849621504.0,
+            "616": 849621504.0,
+            "617": 849621504.0,
+            "618": 849621504.0,
+            "619": 849621504.0,
+            "620": 849621504.0,
+            "621": 849621504.0,
+            "622": 849621504.0,
+            "623": 849621504.0,
+            "624": 849621504.0,
+            "625": 849621504.0,
+            "626": 849621504.0,
+            "627": 849621504.0,
+            "628": 849621504.0,
+            "629": 849621504.0,
+            "630": 849621504.0,
+            "631": 849621504.0,
+            "632": 849621504.0,
+            "633": 849621504.0,
+            "634": 849621504.0,
+            "635": 849621504.0,
+            "636": 849621504.0,
+            "637": 849621504.0,
+            "638": 849621504.0,
+            "639": 849621504.0,
+            "640": 849621504.0,
+            "641": 849621504.0,
+            "642": 849621504.0,
+            "643": 849621504.0,
+            "644": 849621504.0,
+            "645": 849621504.0,
+            "646": 849621504.0,
+            "647": 849621504.0,
+            "648": 849621504.0,
+            "649": 849621504.0,
+            "650": 849621504.0,
+            "651": 849621504.0,
+            "652": 849621504.0,
+            "653": 849621504.0,
+            "654": 849621504.0,
+            "655": 849621504.0,
+            "656": 849621504.0,
+            "657": 849621504.0,
+            "658": 849621504.0,
+            "659": 849621504.0,
+            "660": 849621504.0,
+            "661": 849621504.0,
+            "662": 849621504.0,
+            "663": 849621504.0,
+            "664": 849621504.0,
+            "665": 849621504.0,
+            "666": 849621504.0,
+            "667": 849621504.0,
+            "668": 849621504.0,
+            "669": 849621504.0,
+            "670": 849621504.0,
+            "671": 849621504.0,
+            "672": 849621504.0,
+            "673": 849621504.0,
+            "674": 849621504.0,
+            "675": 849621504.0,
+            "676": 849621504.0,
+            "677": 849621504.0,
+            "678": 849621504.0,
+            "679": 849621504.0,
+            "680": 849621504.0,
+            "681": 849621504.0,
+            "682": 849621504.0,
+            "683": 849621504.0,
+            "684": 849621504.0,
+            "685": 849621504.0,
+            "686": 849621504.0,
+            "687": 849621504.0,
+            "688": 849621504.0,
+            "689": 849621504.0,
+            "690": 849621504.0,
+            "691": 849621504.0,
+            "692": 849621504.0,
+            "693": 849621504.0,
+            "694": 849621504.0,
+            "695": 849621504.0,
+            "696": 849621504.0,
+            "697": 849621504.0,
+            "698": 849621504.0,
+            "699": 849621504.0,
+            "700": 849621504.0,
+            "701": 849621504.0,
+            "702": 849621504.0,
+            "703": 849621504.0,
+            "704": 849621504.0,
+            "705": 849621504.0,
+            "706": 849621504.0,
+            "707": 849621504.0,
+            "708": 849621504.0,
+            "709": 849621504.0,
+            "710": 849621504.0,
+            "711": 849621504.0,
+            "712": 849621504.0,
+            "713": 849621504.0,
+            "714": 849621504.0,
+            "715": 849621504.0,
+            "716": 849621504.0,
+            "717": 849621504.0,
+            "718": 849621504.0,
+            "719": 849621504.0,
+            "720": 849621504.0,
+            "721": 849621504.0,
+            "722": 849621504.0,
+            "723": 849621504.0,
+            "724": 849621504.0,
+            "725": 849621504.0,
+            "726": 849621504.0,
+            "727": 849621504.0,
+            "728": 849621504.0,
+            "729": 849621504.0,
+            "730": 849621504.0,
+            "731": 849621504.0,
+            "732": 849621504.0,
+            "733": 849621504.0,
+            "734": 849621504.0,
+            "735": 849621504.0,
+            "736": 849621504.0,
+            "737": 849621504.0,
+            "738": 849621504.0,
+            "739": 849621504.0,
+            "740": 849621504.0,
+            "741": 849621504.0,
+            "742": 849621504.0,
+            "743": 849621504.0,
+            "744": 849621504.0,
+            "745": 849621504.0,
+            "746": 849621504.0,
+            "747": 849621504.0,
+            "748": 849621504.0,
+            "749": 849621504.0,
+            "750": 849621504.0,
+            "751": 849621504.0,
+            "752": 849621504.0,
+            "753": 849621504.0,
+            "754": 849621504.0,
+            "755": 849621504.0,
+            "756": 849621504.0,
+            "757": 849621504.0,
+            "758": 849621504.0,
+            "759": 849621504.0,
+            "760": 849621504.0,
+            "761": 849621504.0,
+            "762": 849621504.0,
+            "763": 849621504.0,
+            "764": 849621504.0,
+            "765": 849621504.0,
+            "766": 849621504.0,
+            "767": 849621504.0,
+            "768": 849621504.0,
+            "769": 849621504.0,
+            "770": 849621504.0,
+            "771": 849621504.0,
+            "772": 849621504.0,
+            "773": 849621504.0,
+            "774": 849621504.0,
+            "775": 849621504.0,
+            "776": 849621504.0,
+            "777": 849621504.0,
+            "778": 849621504.0,
+            "779": 849621504.0,
+            "780": 849621504.0,
+            "781": 849621504.0,
+            "782": 849621504.0,
+            "783": 849621504.0,
+            "784": 849621504.0,
+            "785": 849621504.0,
+            "786": 849621504.0,
+            "787": 849621504.0,
+            "788": 849621504.0,
+            "789": 849621504.0,
+            "790": 849621504.0,
+            "791": 849621504.0,
+            "792": 849621504.0,
+            "793": 849621504.0,
+            "794": 849621504.0,
+            "795": 849621504.0,
+            "796": 849621504.0,
+            "797": 849621504.0,
+            "798": 849621504.0,
+            "799": 849621504.0,
+            "800": 849621504.0,
+            "801": 849621504.0,
+            "802": 849621504.0,
+            "803": 849621504.0,
+            "804": 849621504.0,
+            "805": 849621504.0,
+            "806": 849621504.0,
+            "807": 849621504.0,
+            "808": 849621504.0,
+            "809": 849621504.0,
+            "810": 849621504.0,
+            "811": 849621504.0,
+            "812": 849621504.0,
+            "813": 849621504.0,
+            "814": 849621504.0,
+            "815": 849621504.0,
+            "816": 849621504.0,
+            "817": 849621504.0,
+            "818": 849621504.0,
+            "819": 849621504.0,
+            "820": 849621504.0,
+            "821": 849621504.0,
+            "822": 849621504.0,
+            "823": 849621504.0,
+            "824": 849621504.0,
+            "825": 849621504.0,
+            "826": 849621504.0,
+            "827": 849621504.0,
+            "828": 849621504.0,
+            "829": 849621504.0,
+            "830": 849621504.0,
+            "831": 849621504.0,
+            "832": 849621504.0,
+            "833": 849621504.0,
+            "834": 849621504.0,
+            "835": 849621504.0,
+            "836": 849621504.0,
+            "837": 849621504.0,
+            "838": 849621504.0,
+            "839": 849621504.0,
+            "840": 849621504.0,
+            "841": 849621504.0,
+            "842": 849621504.0,
+            "843": 849621504.0,
+            "844": 849621504.0,
+            "845": 849621504.0,
+            "846": 849621504.0,
+            "847": 849621504.0,
+            "848": 849621504.0,
+            "849": 849621504.0,
+            "850": 849621504.0,
+            "851": 849621504.0,
+            "852": 849621504.0,
+            "853": 849621504.0,
+            "854": 849621504.0,
+            "855": 849621504.0,
+            "856": 849621504.0,
+            "857": 849621504.0,
+            "858": 849621504.0,
+            "859": 849621504.0,
+            "860": 849621504.0,
+            "861": 849621504.0,
+            "862": 849621504.0,
+            "863": 849621504.0,
+            "864": 849621504.0,
+            "865": 849621504.0,
+            "866": 849621504.0,
+            "867": 849621504.0,
+            "868": 849621504.0,
+            "869": 849621504.0,
+            "870": 849621504.0,
+            "871": 849621504.0,
+            "872": 849621504.0,
+            "873": 849621504.0,
+            "874": 849621504.0,
+            "875": 849621504.0,
+            "876": 849621504.0,
+            "877": 849621504.0,
+            "878": 849621504.0,
+            "879": 849621504.0,
+            "880": 849621504.0,
+            "881": 849621504.0,
+            "882": 849621504.0,
+            "883": 849621504.0,
+            "884": 849621504.0,
+            "885": 849621504.0,
+            "886": 849621504.0,
+            "887": 849621504.0,
+            "888": 849621504.0,
+            "889": 849621504.0,
+            "890": 849621504.0,
+            "891": 849621504.0,
+            "892": 849621504.0,
+            "893": 849621504.0,
+            "894": 849621504.0,
+            "895": 849621504.0,
+            "896": 849621504.0,
+            "897": 849621504.0,
+            "898": 849621504.0,
+            "899": 849621504.0,
+            "900": 849621504.0,
+            "901": 849621504.0,
+            "902": 849621504.0,
+            "903": 849621504.0,
+            "904": 849621504.0,
+            "905": 849621504.0,
+            "906": 849621504.0,
+            "907": 849621504.0,
+            "908": 849621504.0,
+            "909": 849621504.0,
+            "910": 849621504.0,
+            "911": 849621504.0,
+            "912": 849621504.0,
+            "913": 849621504.0,
+            "914": 849621504.0,
+            "915": 849621504.0,
+            "916": 849621504.0,
+            "917": 849621504.0,
+            "918": 849621504.0,
+            "919": 849621504.0,
+            "920": 849621504.0,
+            "921": 849621504.0,
+            "922": 849621504.0,
+            "923": 849621504.0,
+            "924": 849621504.0,
+            "925": 849621504.0,
+            "926": 849621504.0,
+            "927": 849621504.0,
+            "928": 849621504.0,
+            "929": 849621504.0,
+            "930": 849621504.0,
+            "931": 849621504.0,
+            "932": 849621504.0,
+            "933": 849621504.0,
+            "934": 849621504.0,
+            "935": 849621504.0,
+            "936": 849621504.0,
+            "937": 849621504.0,
+            "938": 849621504.0,
+            "939": 849621504.0,
+            "940": 849621504.0,
+            "941": 849621504.0,
+            "942": 849621504.0,
+            "943": 849621504.0,
+            "944": 849621504.0,
+            "945": 849621504.0,
+            "946": 849621504.0,
+            "947": 849621504.0,
+            "948": 849621504.0,
+            "949": 849621504.0,
+            "950": 849621504.0,
+            "951": 849621504.0,
+            "952": 849621504.0,
+            "953": 849621504.0,
+            "954": 849621504.0,
+            "955": 849621504.0,
+            "956": 849621504.0,
+            "957": 849621504.0,
+            "958": 849621504.0,
+            "959": 849621504.0,
+            "960": 849621504.0,
+            "961": 849621504.0,
+            "962": 849621504.0,
+            "963": 849621504.0,
+            "964": 849621504.0,
+            "965": 849621504.0,
+            "966": 849621504.0,
+            "967": 849621504.0,
+            "968": 849621504.0,
+            "969": 849621504.0,
+            "970": 849621504.0,
+            "971": 849621504.0,
+            "972": 849621504.0,
+            "973": 849621504.0,
+            "974": 849621504.0,
+            "975": 849621504.0,
+            "976": 849621504.0,
+            "977": 849621504.0,
+            "978": 849621504.0,
+            "979": 849621504.0,
+            "980": 849621504.0,
+            "981": 849621504.0,
+            "982": 849621504.0,
+            "983": 849621504.0,
+            "984": 849621504.0,
+            "985": 849621504.0,
+            "986": 849621504.0,
+            "987": 849621504.0,
+            "988": 849621504.0,
+            "989": 849621504.0,
+            "990": 849621504.0,
+            "991": 849621504.0,
+            "992": 849621504.0,
+            "993": 849621504.0,
+            "994": 849621504.0,
+            "995": 849621504.0,
+            "996": 849621504.0,
+            "997": 849621504.0,
+            "998": 849621504.0,
+            "999": 849621504.0,
+            "1000": 849621504.0,
+            "1001": 849621504.0,
+            "1002": 849621504.0,
+            "1003": 849621504.0,
+            "1004": 849621504.0,
+            "1005": 849621504.0,
+            "1006": 849621504.0,
+            "1007": 849621504.0,
+            "1008": 849621504.0,
+            "1009": 849621504.0,
+            "1010": 849621504.0,
+            "1011": 849621504.0,
+            "1012": 849621504.0,
+            "1013": 849621504.0,
+            "1014": 849621504.0,
+            "1015": 849621504.0,
+            "1016": 849621504.0,
+            "1017": 849621504.0,
+            "1018": 849621504.0,
+            "1019": 849621504.0,
+            "1020": 849621504.0,
+            "1021": 849621504.0,
+            "1022": 849621504.0,
+            "1023": 849621504.0,
+            "1024": 849621504.0,
+            "1025": 849621504.0,
+            "1026": 849621504.0,
+            "1027": 849621504.0,
+            "1028": 849621504.0,
+            "1029": 849621504.0,
+            "1030": 849621504.0,
+            "1031": 849621504.0,
+            "1032": 849621504.0,
+            "1033": 849621504.0,
+            "1034": 849621504.0,
+            "1035": 849621504.0,
+            "1036": 849621504.0,
+            "1037": 849621504.0,
+            "1038": 849621504.0,
+            "1039": 849621504.0,
+            "1040": 849621504.0,
+            "1041": 849621504.0,
+            "1042": 849621504.0,
+            "1043": 849621504.0,
+            "1044": 849621504.0,
+            "1045": 849621504.0,
+            "1046": 849621504.0,
+            "1047": 849621504.0,
+            "1048": 849621504.0,
+            "1049": 849621504.0,
+            "1050": 849621504.0,
+            "1051": 849621504.0,
+            "1052": 849621504.0,
+            "1053": 849621504.0,
+            "1054": 849621504.0,
+            "1055": 849621504.0,
+            "1056": 849621504.0,
+            "1057": 849621504.0,
+            "1058": 849621504.0,
+            "1059": 849621504.0,
+            "1060": 849621504.0,
+            "1061": 849621504.0,
+            "1062": 849621504.0,
+            "1063": 849621504.0,
+            "1064": 849621504.0,
+            "1065": 849621504.0,
+            "1066": 849621504.0,
+            "1067": 849621504.0,
+            "1068": 849621504.0,
+            "1069": 849621504.0,
+            "1070": 849621504.0,
+            "1071": 849621504.0,
+            "1072": 849621504.0,
+            "1073": 849621504.0,
+            "1074": 849621504.0,
+            "1075": 849621504.0,
+            "1076": 849621504.0,
+            "1077": 849621504.0,
+            "1078": 849621504.0,
+            "1079": 849621504.0,
+            "1080": 849621504.0,
+            "1081": 849621504.0,
+            "1082": 849621504.0,
+            "1083": 849621504.0,
+            "1084": 849621504.0,
+            "1085": 849621504.0,
+            "1086": 849621504.0,
+            "1087": 849621504.0,
+            "1088": 849621504.0,
+            "1089": 849621504.0,
+            "1090": 849621504.0,
+            "1091": 849621504.0,
+            "1092": 849621504.0,
+            "1093": 849621504.0,
+            "1094": 849621504.0,
+            "1095": 849621504.0,
+            "1096": 849621504.0,
+            "1097": 849621504.0,
+            "1098": 849621504.0,
+            "1099": 849621504.0,
+            "1100": 849621504.0,
+            "1101": 849621504.0,
+            "1102": 849621504.0,
+            "1103": 849621504.0,
+            "1104": 849621504.0,
+            "1105": 849621504.0,
+            "1106": 849621504.0,
+            "1107": 849621504.0,
+            "1108": 849621504.0,
+            "1109": 849621504.0,
+            "1110": 849621504.0,
+            "1111": 849621504.0,
+            "1112": 849621504.0,
+            "1113": 849621504.0,
+            "1114": 849621504.0,
+            "1115": 849621504.0,
+            "1116": 849621504.0,
+            "1117": 849621504.0,
+            "1118": 849621504.0,
+            "1119": 849621504.0,
+            "1120": 849621504.0,
+            "1121": 849621504.0,
+            "1122": 849621504.0,
+            "1123": 849621504.0,
+            "1124": 849621504.0,
+            "1125": 849621504.0,
+            "1126": 849621504.0,
+            "1127": 849621504.0,
+            "1128": 849621504.0,
+            "1129": 849621504.0,
+            "1130": 849621504.0,
+            "1131": 849621504.0,
+            "1132": 849621504.0,
+            "1133": 849621504.0,
+            "1134": 849621504.0,
+            "1135": 849621504.0,
+            "1136": 849621504.0,
+            "1137": 849621504.0,
+            "1138": 849621504.0,
+            "1139": 849621504.0,
+            "1140": 849621504.0,
+            "1141": 849621504.0,
+            "1142": 849621504.0,
+            "1143": 849621504.0,
+            "1144": 849621504.0,
+            "1145": 849621504.0,
+            "1146": 849621504.0,
+            "1147": 849621504.0,
+            "1148": 849621504.0,
+            "1149": 849621504.0,
+            "1150": 849621504.0,
+            "1151": 849621504.0,
+            "1152": 849621504.0,
+            "1153": 849621504.0,
+            "1154": 849621504.0,
+            "1155": 849621504.0,
+            "1156": 849621504.0,
+            "1157": 849621504.0,
+            "1158": 849621504.0,
+            "1159": 849621504.0,
+            "1160": 849621504.0,
+            "1161": 849621504.0,
+            "1162": 849621504.0,
+            "1163": 849621504.0,
+            "1164": 849621504.0,
+            "1165": 849621504.0,
+            "1166": 849621504.0,
+            "1167": 849621504.0,
+            "1168": 849621504.0,
+            "1169": 849621504.0,
+            "1170": 849621504.0,
+            "1171": 849621504.0,
+            "1172": 849621504.0,
+            "1173": 849621504.0,
+            "1174": 849621504.0,
+            "1175": 849621504.0,
+            "1176": 849621504.0,
+            "1177": 849621504.0,
+            "1178": 849621504.0,
+            "1179": 849621504.0,
+            "1180": 849621504.0,
+            "1181": 849621504.0,
+            "1182": 849621504.0,
+            "1183": 849621504.0,
+            "1184": 849621504.0,
+            "1185": 849621504.0,
+            "1186": 849621504.0,
+            "1187": 849621504.0,
+            "1188": 849621504.0,
+            "1189": 849621504.0,
+            "1190": 849621504.0,
+            "1191": 849621504.0,
+            "1192": 849621504.0,
+            "1193": 849621504.0,
+            "1194": 849621504.0,
+            "1195": 849621504.0,
+            "1196": 849621504.0,
+            "1197": 849621504.0,
+            "1198": 849621504.0,
+            "1199": 849621504.0,
+            "1200": 849621504.0,
+            "1201": 849621504.0,
+            "1202": 849621504.0,
+            "1203": 849621504.0,
+            "1204": 849621504.0,
+            "1205": 849621504.0,
+            "1206": 849621504.0,
+            "1207": 849621504.0,
+            "1208": 849621504.0,
+            "1209": 849621504.0,
+            "1210": 849621504.0,
+            "1211": 849621504.0,
+            "1212": 849621504.0,
+            "1213": 849621504.0,
+            "1214": 849621504.0,
+            "1215": 849621504.0,
+            "1216": 849621504.0,
+            "1217": 849621504.0,
+            "1218": 849621504.0,
+            "1219": 849621504.0,
+            "1220": 849621504.0,
+            "1221": 849621504.0,
+            "1222": 849621504.0,
+            "1223": 849621504.0,
+            "1224": 849621504.0,
+            "1225": 849621504.0,
+            "1226": 849621504.0,
+            "1227": 849621504.0,
+            "1228": 849621504.0,
+            "1229": 849621504.0,
+            "1230": 849621504.0,
+            "1231": 849621504.0,
+            "1232": 849621504.0,
+            "1233": 849621504.0,
+            "1234": 849621504.0,
+            "1235": 849621504.0,
+            "1236": 849621504.0,
+            "1237": 849621504.0,
+            "1238": 849621504.0,
+            "1239": 849621504.0,
+            "1240": 849621504.0,
+            "1241": 849621504.0,
+            "1242": 849621504.0,
+            "1243": 849621504.0,
+            "1244": 849621504.0,
+            "1245": 849621504.0,
+            "1246": 849621504.0,
+            "1247": 849621504.0,
+            "1248": 849621504.0,
+            "1249": 849621504.0,
+            "1250": 849621504.0,
+            "1251": 849621504.0,
+            "1252": 849621504.0,
+            "1253": 849621504.0,
+            "1254": 849621504.0,
+            "1255": 849621504.0,
+            "1256": 849621504.0,
+            "1257": 849621504.0,
+            "1258": 849621504.0,
+            "1259": 849621504.0,
+            "1260": 849621504.0,
+            "1261": 849621504.0,
+            "1262": 849621504.0,
+            "1263": 849621504.0,
+            "1264": 849621504.0,
+            "1265": 849621504.0,
+            "1266": 849621504.0,
+            "1267": 849621504.0,
+            "1268": 849621504.0,
+            "1269": 849621504.0,
+            "1270": 849621504.0,
+            "1271": 849621504.0,
+            "1272": 849621504.0,
+            "1273": 849621504.0,
+            "1274": 849621504.0,
+            "1275": 849621504.0,
+            "1276": 849621504.0,
+            "1277": 849621504.0,
+            "1278": 849621504.0,
+            "1279": 849621504.0,
+            "1280": 849621504.0,
+            "1281": 849621504.0,
+            "1282": 849621504.0,
+            "1283": 849621504.0,
+            "1284": 849621504.0,
+            "1285": 849621504.0,
+            "1286": 849621504.0,
+            "1287": 849621504.0,
+            "1288": 849621504.0,
+            "1289": 849621504.0,
+            "1290": 849621504.0,
+            "1291": 849621504.0,
+            "1292": 849621504.0,
+            "1293": 849621504.0,
+            "1294": 849621504.0,
+            "1295": 849621504.0,
+            "1296": 849621504.0,
+            "1297": 849621504.0,
+            "1298": 849621504.0,
+            "1299": 849621504.0,
+            "1300": 849621504.0,
+            "1301": 849621504.0,
+            "1302": 849621504.0,
+            "1303": 849621504.0,
+            "1304": 849621504.0,
+            "1305": 849621504.0,
+            "1306": 849621504.0,
+            "1307": 849621504.0,
+            "1308": 849621504.0,
+            "1309": 849621504.0,
+            "1310": 849621504.0,
+            "1311": 849621504.0,
+            "1312": 849621504.0,
+            "1313": 849621504.0,
+            "1314": 849621504.0,
+            "1315": 849621504.0,
+            "1316": 849621504.0,
+            "1317": 849621504.0,
+            "1318": 849621504.0,
+            "1319": 849621504.0,
+            "1320": 849621504.0,
+            "1321": 849621504.0,
+            "1322": 849621504.0,
+            "1323": 849621504.0,
+            "1324": 849621504.0,
+            "1325": 849621504.0,
+            "1326": 849621504.0,
+            "1327": 849621504.0,
+            "1328": 849621504.0,
+            "1329": 849621504.0,
+            "1330": 849621504.0,
+            "1331": 849621504.0,
+            "1332": 849621504.0,
+            "1333": 849621504.0,
+            "1334": 849621504.0,
+            "1335": 849621504.0,
+            "1336": 849621504.0,
+            "1337": 849621504.0,
+            "1338": 849621504.0,
+            "1339": 849621504.0,
+            "1340": 849621504.0,
+            "1341": 849621504.0,
+            "1342": 849621504.0,
+            "1343": 849621504.0,
+            "1344": 849621504.0,
+            "1345": 849621504.0,
+            "1346": 849621504.0,
+            "1347": 849621504.0,
+            "1348": 849621504.0,
+            "1349": 849621504.0,
+            "1350": 849621504.0,
+            "1351": 849621504.0,
+            "1352": 849621504.0,
+            "1353": 849621504.0,
+            "1354": 849621504.0,
+            "1355": 849621504.0,
+            "1356": 849621504.0,
+            "1357": 849621504.0,
+            "1358": 849621504.0,
+            "1359": 849621504.0,
+            "1360": 849621504.0,
+            "1361": 849621504.0,
+            "1362": 849621504.0,
+            "1363": 849621504.0,
+            "1364": 849621504.0,
+            "1365": 849621504.0,
+            "1366": 849621504.0,
+            "1367": 849621504.0,
+            "1368": 849621504.0,
+            "1369": 849621504.0,
+            "1370": 849621504.0,
+            "1371": 849621504.0,
+            "1372": 849621504.0,
+            "1373": 849621504.0,
+            "1374": 849621504.0,
+            "1375": 849621504.0,
+            "1376": 849621504.0,
+            "1377": 849621504.0,
+            "1378": 849621504.0,
+            "1379": 849621504.0,
+            "1380": 849621504.0,
+            "1381": 849621504.0,
+            "1382": 849621504.0,
+            "1383": 849621504.0,
+            "1384": 849621504.0,
+            "1385": 849621504.0,
+            "1386": 849621504.0,
+            "1387": 849621504.0,
+            "1388": 849621504.0,
+            "1389": 849621504.0,
+            "1390": 849621504.0,
+            "1391": 849621504.0,
+            "1392": 849621504.0,
+            "1393": 849621504.0,
+            "1394": 849621504.0,
+            "1395": 849621504.0,
+            "1396": 849621504.0,
+            "1397": 849621504.0,
+            "1398": 849621504.0,
+            "1399": 849621504.0,
+            "1400": 849621504.0,
+            "1401": 849621504.0,
+            "1402": 849621504.0,
+            "1403": 849621504.0,
+            "1404": 849621504.0,
+            "1405": 849621504.0,
+            "1406": 849621504.0,
+            "1407": 849621504.0,
+            "1408": 849621504.0,
+            "1409": 849621504.0,
+            "1410": 849621504.0,
+            "1411": 849621504.0,
+            "1412": 849621504.0,
+            "1413": 849621504.0,
+            "1414": 849621504.0,
+            "1415": 849621504.0,
+            "1416": 849621504.0,
+            "1417": 849621504.0,
+            "1418": 849621504.0,
+            "1419": 849621504.0,
+            "1420": 849621504.0,
+            "1421": 849621504.0,
+            "1422": 849621504.0,
+            "1423": 849621504.0,
+            "1424": 849621504.0,
+            "1425": 849621504.0,
+            "1426": 849621504.0,
+            "1427": 849621504.0,
+            "1428": 849621504.0,
+            "1429": 849621504.0,
+            "1430": 849621504.0,
+            "1431": 849621504.0,
+            "1432": 849621504.0,
+            "1433": 849621504.0,
+            "1434": 849621504.0,
+            "1435": 849621504.0,
+            "1436": 849621504.0,
+            "1437": 849621504.0,
+            "1438": 849621504.0,
+            "1439": 849621504.0,
+            "1440": 849621504.0,
+            "1441": 849621504.0,
+            "1442": 849621504.0,
+            "1443": 849621504.0,
+            "1444": 849621504.0,
+            "1445": 849621504.0,
+            "1446": 849621504.0,
+            "1447": 849621504.0,
+            "1448": 849621504.0,
+            "1449": 849621504.0,
+            "1450": 849621504.0,
+            "1451": 849621504.0,
+            "1452": 849621504.0,
+            "1453": 849621504.0,
+            "1454": 849621504.0,
+            "1455": 849621504.0,
+            "1456": 849621504.0,
+            "1457": 849621504.0,
+            "1458": 849621504.0,
+            "1459": 849621504.0,
+            "1460": 849621504.0,
+            "1461": 849621504.0,
+            "1462": 849621504.0,
+            "1463": 849621504.0,
+            "1464": 849621504.0,
+            "1465": 849621504.0,
+            "1466": 849621504.0,
+            "1467": 849621504.0,
+            "1468": 849621504.0,
+            "1469": 849621504.0,
+            "1470": 849621504.0,
+            "1471": 849621504.0,
+            "1472": 849621504.0,
+            "1473": 849621504.0,
+            "1474": 849621504.0,
+            "1475": 849621504.0,
+            "1476": 849621504.0,
+            "1477": 849621504.0,
+            "1478": 849621504.0,
+            "1479": 849621504.0,
+            "1480": 849621504.0,
+            "1481": 849621504.0,
+            "1482": 849621504.0,
+            "1483": 849621504.0,
+            "1484": 849621504.0,
+            "1485": 849621504.0,
+            "1486": 849621504.0,
+            "1487": 849621504.0,
+            "1488": 849621504.0,
+            "1489": 849621504.0,
+            "1490": 849621504.0,
+            "1491": 849621504.0,
+            "1492": 849621504.0,
+            "1493": 849621504.0,
+            "1494": 849621504.0,
+            "1495": 849621504.0,
+            "1496": 849621504.0,
+            "1497": 849621504.0,
+            "1498": 849621504.0,
+            "1499": 849621504.0,
+            "1500": 849621504.0,
+            "1501": 849621504.0,
+            "1502": 849621504.0,
+            "1503": 849621504.0,
+            "1504": 849621504.0,
+            "1505": 849621504.0,
+            "1506": 849621504.0,
+            "1507": 849621504.0,
+            "1508": 849621504.0,
+            "1509": 849621504.0,
+            "1510": 849621504.0,
+            "1511": 849621504.0,
+            "1512": 849621504.0,
+            "1513": 849621504.0,
+            "1514": 849621504.0,
+            "1515": 849621504.0,
+            "1516": 849621504.0,
+            "1517": 849621504.0,
+            "1518": 849621504.0,
+            "1519": 849621504.0,
+            "1520": 849621504.0,
+            "1521": 849621504.0,
+            "1522": 849621504.0,
+            "1523": 849621504.0,
+            "1524": 849621504.0,
+            "1525": 849621504.0,
+            "1526": 849621504.0,
+            "1527": 849621504.0,
+            "1528": 849621504.0,
+            "1529": 849621504.0,
+            "1530": 849621504.0,
+            "1531": 849621504.0,
+            "1532": 849621504.0,
+            "1533": 849621504.0,
+            "1534": 849621504.0,
+            "1535": 849621504.0,
+            "1536": 849621504.0,
+            "1537": 849621504.0,
+            "1538": 849621504.0,
+            "1539": 849621504.0,
+            "1540": 849621504.0,
+            "1541": 849621504.0,
+            "1542": 849621504.0,
+            "1543": 849621504.0,
+            "1544": 849621504.0,
+            "1545": 849621504.0,
+            "1546": 849621504.0,
+            "1547": 849621504.0,
+            "1548": 849621504.0,
+            "1549": 849621504.0,
+            "1550": 849621504.0,
+            "1551": 849621504.0,
+            "1552": 849621504.0,
+            "1553": 849621504.0,
+            "1554": 849621504.0,
+            "1555": 849621504.0,
+            "1556": 849621504.0,
+            "1557": 849621504.0,
+            "1558": 849621504.0,
+            "1559": 849621504.0,
+            "1560": 849621504.0,
+            "1561": 849621504.0,
+            "1562": 849621504.0,
+            "1563": 849621504.0,
+            "1564": 849621504.0,
+            "1565": 849621504.0,
+            "1566": 849621504.0,
+            "1567": 849621504.0,
+            "1568": 849621504.0,
+            "1569": 849621504.0,
+            "1570": 849621504.0,
+            "1571": 849621504.0,
+            "1572": 849621504.0,
+            "1573": 849621504.0,
+            "1574": 849621504.0,
+            "1575": 849621504.0,
+            "1576": 849621504.0,
+            "1577": 849621504.0,
+            "1578": 849621504.0,
+            "1579": 849621504.0,
+            "1580": 849621504.0,
+            "1581": 849621504.0,
+            "1582": 849621504.0,
+            "1583": 849621504.0,
+            "1584": 849621504.0,
+            "1585": 849621504.0,
+            "1586": 849621504.0,
+            "1587": 849621504.0,
+            "1588": 849621504.0,
+            "1589": 849621504.0,
+            "1590": 849621504.0,
+            "1591": 849621504.0,
+            "1592": 849621504.0,
+            "1593": 849621504.0,
+            "1594": 849621504.0,
+            "1595": 849621504.0,
+            "1596": 849621504.0,
+            "1597": 849621504.0,
+            "1598": 849621504.0,
+            "1599": 849621504.0,
+            "1600": 849621504.0,
+            "1601": 849621504.0,
+            "1602": 849621504.0,
+            "1603": 849621504.0,
+            "1604": 849621504.0,
+            "1605": 849621504.0,
+            "1606": 849621504.0,
+            "1607": 849621504.0,
+            "1608": 849621504.0,
+            "1609": 849621504.0,
+            "1610": 849621504.0,
+            "1611": 849621504.0,
+            "1612": 849621504.0,
+            "1613": 849621504.0,
+            "1614": 849621504.0,
+            "1615": 849621504.0,
+            "1616": 849621504.0,
+            "1617": 849621504.0,
+            "1618": 849621504.0,
+            "1619": 849621504.0,
+            "1620": 849621504.0,
+            "1621": 849621504.0,
+            "1622": 849621504.0,
+            "1623": 849621504.0,
+            "1624": 849621504.0,
+            "1625": 849621504.0,
+            "1626": 849621504.0,
+            "1627": 849621504.0,
+            "1628": 849621504.0,
+            "1629": 849621504.0,
+            "1630": 849621504.0,
+            "1631": 849621504.0,
+            "1632": 849621504.0,
+            "1633": 849621504.0,
+            "1634": 849621504.0,
+            "1635": 849621504.0,
+            "1636": 849621504.0,
+            "1637": 849621504.0,
+            "1638": 849621504.0,
+            "1639": 849621504.0,
+            "1640": 849621504.0,
+            "1641": 849621504.0,
+            "1642": 849621504.0,
+            "1643": 849621504.0,
+            "1644": 849621504.0,
+            "1645": 849621504.0,
+            "1646": 849621504.0,
+            "1647": 849621504.0,
+            "1648": 849621504.0,
+            "1649": 849621504.0,
+            "1650": 849621504.0,
+            "1651": 849621504.0,
+            "1652": 849621504.0,
+            "1653": 849621504.0,
+            "1654": 849621504.0,
+            "1655": 849621504.0,
+            "1656": 849621504.0,
+            "1657": 849621504.0,
+            "1658": 849621504.0,
+            "1659": 849621504.0,
+            "1660": 849621504.0,
+            "1661": 849621504.0,
+            "1662": 849621504.0,
+            "1663": 849621504.0,
+            "1664": 849621504.0,
+            "1665": 849621504.0,
+            "1666": 849621504.0,
+            "1667": 849621504.0,
+            "1668": 849621504.0,
+            "1669": 849621504.0,
+            "1670": 849621504.0,
+            "1671": 849621504.0,
+            "1672": 849621504.0,
+            "1673": 849621504.0,
+            "1674": 849621504.0,
+            "1675": 849621504.0,
+            "1676": 849621504.0,
+            "1677": 849621504.0,
+            "1678": 849621504.0,
+            "1679": 849621504.0,
+            "1680": 849621504.0,
+            "1681": 849621504.0,
+            "1682": 849621504.0,
+            "1683": 849621504.0,
+            "1684": 849621504.0,
+            "1685": 849621504.0,
+            "1686": 849621504.0,
+            "1687": 849621504.0,
+            "1688": 849621504.0,
+            "1689": 849621504.0,
+            "1690": 849621504.0,
+            "1691": 849621504.0,
+            "1692": 849621504.0,
+            "1693": 849621504.0,
+            "1694": 849621504.0,
+            "1695": 849621504.0,
+            "1696": 849621504.0,
+            "1697": 849621504.0,
+            "1698": 849621504.0,
+            "1699": 849621504.0,
+            "1700": 849621504.0,
+            "1701": 849621504.0,
+            "1702": 849621504.0,
+            "1703": 849621504.0,
+            "1704": 849621504.0,
+            "1705": 849621504.0,
+            "1706": 849621504.0,
+            "1707": 849621504.0,
+            "1708": 849621504.0,
+            "1709": 849621504.0,
+            "1710": 849621504.0,
+            "1711": 849621504.0,
+            "1712": 849621504.0,
+            "1713": 849621504.0,
+            "1714": 849621504.0,
+            "1715": 849621504.0,
+            "1716": 849621504.0,
+            "1717": 849621504.0,
+            "1718": 849621504.0,
+            "1719": 849621504.0,
+            "1720": 849621504.0,
+            "1721": 849621504.0,
+            "1722": 849621504.0,
+            "1723": 849621504.0,
+            "1724": 849621504.0,
+            "1725": 849621504.0,
+            "1726": 849621504.0,
+            "1727": 849621504.0,
+            "1728": 849621504.0,
+            "1729": 849621504.0,
+            "1730": 849621504.0,
+            "1731": 849621504.0,
+            "1732": 849621504.0,
+            "1733": 849621504.0,
+            "1734": 849621504.0,
+            "1735": 849621504.0,
+            "1736": 849621504.0,
+            "1737": 849621504.0,
+            "1738": 849621504.0,
+            "1739": 849621504.0,
+            "1740": 849621504.0,
+            "1741": 849621504.0,
+            "1742": 849621504.0,
+            "1743": 849621504.0,
+            "1744": 849621504.0,
+            "1745": 849621504.0,
+            "1746": 849621504.0,
+            "1747": 849621504.0,
+            "1748": 849621504.0,
+            "1749": 849621504.0,
+            "1750": 849621504.0,
+            "1751": 849621504.0,
+            "1752": 849621504.0,
+            "1753": 849621504.0,
+            "1754": 849621504.0,
+            "1755": 849621504.0,
+            "1756": 849621504.0,
+            "1757": 849621504.0,
+            "1758": 849621504.0,
+            "1759": 849621504.0,
+            "1760": 849621504.0,
+            "1761": 849621504.0,
+            "1762": 849621504.0,
+            "1763": 849621504.0,
+            "1764": 849621504.0,
+            "1765": 849621504.0,
+            "1766": 849621504.0,
+            "1767": 849621504.0,
+            "1768": 849621504.0,
+            "1769": 849621504.0,
+            "1770": 849621504.0,
+            "1771": 849621504.0,
+            "1772": 849621504.0,
+            "1773": 849621504.0,
+            "1774": 849621504.0,
+            "1775": 849621504.0,
+            "1776": 849621504.0,
+            "1777": 849621504.0,
+            "1778": 849621504.0,
+            "1779": 849621504.0,
+            "1780": 849621504.0,
+            "1781": 849621504.0,
+            "1782": 849621504.0,
+            "1783": 849621504.0,
+            "1784": 849621504.0,
+            "1785": 849621504.0,
+            "1786": 849621504.0,
+            "1787": 849621504.0,
+            "1788": 849621504.0,
+            "1789": 849621504.0,
+            "1790": 849621504.0,
+            "1791": 849621504.0,
+            "1792": 849621504.0,
+            "1793": 849621504.0,
+            "1794": 849621504.0,
+            "1795": 849621504.0,
+            "1796": 849621504.0,
+            "1797": 849621504.0,
+            "1798": 849621504.0,
+            "1799": 849621504.0,
+            "1800": 849621504.0,
+            "1801": 849621504.0,
+            "1802": 849621504.0,
+            "1803": 849621504.0,
+            "1804": 849621504.0,
+            "1805": 849621504.0,
+            "1806": 849621504.0,
+            "1807": 849621504.0,
+            "1808": 849621504.0,
+            "1809": 849621504.0,
+            "1810": 849621504.0,
+            "1811": 849621504.0,
+            "1812": 849621504.0,
+            "1813": 849621504.0,
+            "1814": 849621504.0,
+            "1815": 849621504.0,
+            "1816": 849621504.0,
+            "1817": 849621504.0,
+            "1818": 849621504.0,
+            "1819": 849621504.0,
+            "1820": 849621504.0,
+            "1821": 849621504.0,
+            "1822": 849621504.0,
+            "1823": 849621504.0,
+            "1824": 849621504.0,
+            "1825": 849621504.0,
+            "1826": 849621504.0,
+            "1827": 849621504.0,
+            "1828": 849621504.0,
+            "1829": 849621504.0,
+            "1830": 849621504.0,
+            "1831": 849621504.0,
+            "1832": 849621504.0,
+            "1833": 849621504.0,
+            "1834": 849621504.0,
+            "1835": 849621504.0,
+            "1836": 849621504.0,
+            "1837": 849621504.0,
+            "1838": 849621504.0,
+            "1839": 849621504.0,
+            "1840": 849621504.0,
+            "1841": 849621504.0,
+            "1842": 849621504.0,
+            "1843": 849621504.0,
+            "1844": 849621504.0,
+            "1845": 849621504.0,
+            "1846": 849621504.0,
+            "1847": 849621504.0,
+            "1848": 849621504.0,
+            "1849": 849621504.0,
+            "1850": 849621504.0,
+            "1851": 849621504.0,
+            "1852": 849621504.0,
+            "1853": 849621504.0,
+            "1854": 849621504.0,
+            "1855": 849621504.0,
+            "1856": 849621504.0,
+            "1857": 849621504.0,
+            "1858": 849621504.0,
+            "1859": 849621504.0,
+            "1860": 849621504.0,
+            "1861": 849621504.0,
+            "1862": 849621504.0,
+            "1863": 849621504.0,
+            "1864": 849621504.0,
+            "1865": 849621504.0,
+            "1866": 849621504.0,
+            "1867": 849621504.0,
+            "1868": 849621504.0,
+            "1869": 849621504.0,
+            "1870": 849621504.0,
+            "1871": 849621504.0,
+            "1872": 849621504.0,
+            "1873": 849621504.0,
+            "1874": 849621504.0,
+            "1875": 849621504.0,
+            "1876": 849621504.0,
+            "1877": 849621504.0,
+            "1878": 849621504.0,
+            "1879": 849621504.0,
+            "1880": 849621504.0,
+            "1881": 849621504.0,
+            "1882": 849621504.0,
+            "1883": 849621504.0,
+            "1884": 849621504.0,
+            "1885": 849621504.0,
+            "1886": 849621504.0,
+            "1887": 849621504.0,
+            "1888": 849621504.0,
+            "1889": 849621504.0,
+            "1890": 849621504.0,
+            "1891": 849621504.0,
+            "1892": 849621504.0,
+            "1893": 849621504.0,
+            "1894": 849621504.0,
+            "1895": 849621504.0,
+            "1896": 849621504.0,
+            "1897": 849621504.0,
+            "1898": 849621504.0,
+            "1899": 849621504.0,
+            "1900": 849621504.0,
+            "1901": 849621504.0,
+            "1902": 849621504.0,
+            "1903": 849621504.0,
+            "1904": 849621504.0,
+            "1905": 849621504.0,
+            "1906": 849621504.0,
+            "1907": 849621504.0,
+            "1908": 849621504.0,
+            "1909": 849621504.0,
+            "1910": 849621504.0,
+            "1911": 849621504.0,
+            "1912": 849621504.0,
+            "1913": 849621504.0,
+            "1914": 849621504.0,
+            "1915": 849621504.0,
+            "1916": 849621504.0,
+            "1917": 849621504.0,
+            "1918": 849621504.0,
+            "1919": 849621504.0,
+            "1920": 849621504.0,
+            "1921": 849621504.0,
+            "1922": 849621504.0,
+            "1923": 849621504.0,
+            "1924": 849621504.0,
+            "1925": 849621504.0,
+            "1926": 849621504.0,
+            "1927": 849621504.0,
+            "1928": 849621504.0,
+            "1929": 849621504.0,
+            "1930": 849621504.0,
+            "1931": 849621504.0,
+            "1932": 849621504.0,
+            "1933": 849621504.0,
+            "1934": 849621504.0,
+            "1935": 849621504.0,
+            "1936": 849621504.0,
+            "1937": 849621504.0,
+            "1938": 849621504.0,
+            "1939": 849621504.0,
+            "1940": 849621504.0,
+            "1941": 849621504.0,
+            "1942": 849621504.0,
+            "1943": 849621504.0,
+            "1944": 849621504.0,
+            "1945": 849621504.0,
+            "1946": 849621504.0,
+            "1947": 849621504.0,
+            "1948": 849621504.0,
+            "1949": 849621504.0,
+            "1950": 849621504.0,
+            "1951": 849621504.0,
+            "1952": 849621504.0,
+            "1953": 849621504.0,
+            "1954": 849621504.0,
+            "1955": 849621504.0,
+            "1956": 849621504.0,
+            "1957": 849621504.0,
+            "1958": 849621504.0,
+            "1959": 849621504.0,
+            "1960": 849621504.0,
+            "1961": 849621504.0,
+            "1962": 849621504.0,
+            "1963": 849621504.0,
+            "1964": 849621504.0,
+            "1965": 849621504.0,
+            "1966": 849621504.0,
+            "1967": 849621504.0,
+            "1968": 849621504.0,
+            "1969": 849621504.0,
+            "1970": 849621504.0,
+            "1971": 849621504.0,
+            "1972": 849621504.0,
+            "1973": 849621504.0,
+            "1974": 849621504.0,
+            "1975": 849621504.0,
+            "1976": 849621504.0,
+            "1977": 849621504.0,
+            "1978": 849621504.0,
+            "1979": 849621504.0,
+            "1980": 849621504.0,
+            "1981": 849621504.0,
+            "1982": 849621504.0,
+            "1983": 849621504.0,
+            "1984": 849621504.0,
+            "1985": 849621504.0,
+            "1986": 849621504.0,
+            "1987": 849621504.0,
+            "1988": 849621504.0,
+            "1989": 849621504.0,
+            "1990": 849621504.0,
+            "1991": 849621504.0,
+            "1992": 849621504.0,
+            "1993": 849621504.0,
+            "1994": 849621504.0,
+            "1995": 849621504.0,
+            "1996": 849621504.0,
+            "1997": 849621504.0,
+            "1998": 849621504.0,
+            "1999": 849621504.0,
+            "2000": 849621504.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 14.94115,
+            "2": 1.30868,
+            "3": 1.13391,
+            "4": 1.12792,
+            "5": 1.13103,
+            "6": 1.1383,
+            "7": 1.13573,
+            "8": 1.15789,
+            "9": 1.12704,
+            "10": 1.1241,
+            "11": 1.12786,
+            "12": 1.1288,
+            "13": 1.1399,
+            "14": 1.13165,
+            "15": 1.12333,
+            "16": 1.12398,
+            "17": 1.12493,
+            "18": 1.11586,
+            "19": 1.1123,
+            "20": 1.11192,
+            "21": 1.1266,
+            "22": 1.13629,
+            "23": 1.13171,
+            "24": 1.14969,
+            "25": 1.17022,
+            "26": 1.14634,
+            "27": 1.14242,
+            "28": 1.14353,
+            "29": 1.14554,
+            "30": 1.28826,
+            "31": 1.14265,
+            "32": 1.14023,
+            "33": 1.15286,
+            "34": 1.14975,
+            "35": 1.13988,
+            "36": 1.62757,
+            "37": 2.22703,
+            "38": 1.36074,
+            "39": 1.1325,
+            "40": 1.14106,
+            "41": 1.14114,
+            "42": 1.13305,
+            "43": 1.12375,
+            "44": 1.12631,
+            "45": 1.12358,
+            "46": 1.12334,
+            "47": 1.12398,
+            "48": 1.12749,
+            "49": 1.13897,
+            "50": 1.13563,
+            "51": 1.13628,
+            "52": 1.12935,
+            "53": 1.12779,
+            "54": 1.13147,
+            "55": 1.1279,
+            "56": 1.12777,
+            "57": 1.1269,
+            "58": 1.13989,
+            "59": 1.13378,
+            "60": 1.13552,
+            "61": 1.12879,
+            "62": 1.4796,
+            "63": 1.12843,
+            "64": 1.12488,
+            "65": 1.12888,
+            "66": 1.14028,
+            "67": 1.13532,
+            "68": 1.13278,
+            "69": 1.12779,
+            "70": 1.12468,
+            "71": 1.12483,
+            "72": 1.12423,
+            "73": 1.12335,
+            "74": 1.12699,
+            "75": 1.13379,
+            "76": 1.13001,
+            "77": 1.12994,
+            "78": 1.13166,
+            "79": 1.12415,
+            "80": 1.126,
+            "81": 1.16016,
+            "82": 1.13845,
+            "83": 1.13882,
+            "84": 1.14455,
+            "85": 1.46908,
+            "86": 1.1259,
+            "87": 1.12119,
+            "88": 1.12312,
+            "89": 1.12593,
+            "90": 1.51995,
+            "91": 1.16022,
+            "92": 1.1304,
+            "93": 1.13161,
+            "94": 1.13511,
+            "95": 1.13911,
+            "96": 1.80205,
+            "97": 1.13368,
+            "98": 1.13335,
+            "99": 1.13549,
+            "100": 1.13409,
+            "101": 1.13703,
+            "102": 1.14592,
+            "103": 1.13516,
+            "104": 1.13661,
+            "105": 1.13299,
+            "106": 1.13577,
+            "107": 1.13657,
+            "108": 1.13144,
+            "109": 1.14828,
+            "110": 1.15036,
+            "111": 1.1486,
+            "112": 1.14183,
+            "113": 1.14297,
+            "114": 1.1411,
+            "115": 1.14318,
+            "116": 1.14291,
+            "117": 1.14168,
+            "118": 1.15055,
+            "119": 1.1482,
+            "120": 1.15352,
+            "121": 1.13046,
+            "122": 1.145,
+            "123": 1.14278,
+            "124": 1.1428,
+            "125": 1.14189,
+            "126": 1.13609,
+            "127": 1.14025,
+            "128": 1.14097,
+            "129": 1.13489,
+            "130": 1.13417,
+            "131": 1.13581,
+            "132": 1.13708,
+            "133": 1.17896,
+            "134": 1.13176,
+            "135": 1.12984,
+            "136": 1.1435,
+            "137": 1.15088,
+            "138": 1.14391,
+            "139": 1.14409,
+            "140": 1.14238,
+            "141": 1.14313,
+            "142": 1.1493,
+            "143": 1.13518,
+            "144": 1.13229,
+            "145": 1.13749,
+            "146": 1.15049,
+            "147": 1.16077,
+            "148": 1.14254,
+            "149": 1.14071,
+            "150": 1.14075,
+            "151": 1.13943,
+            "152": 1.15276,
+            "153": 1.15369,
+            "154": 1.14618,
+            "155": 1.14225,
+            "156": 1.14285,
+            "157": 1.14106,
+            "158": 1.14415,
+            "159": 1.14445,
+            "160": 1.14934,
+            "161": 1.14229,
+            "162": 1.14167,
+            "163": 1.14058,
+            "164": 1.14064,
+            "165": 1.14012,
+            "166": 1.15198,
+            "167": 1.15221,
+            "168": 1.1471,
+            "169": 1.14122,
+            "170": 1.14769,
+            "171": 1.14073,
+            "172": 1.14205,
+            "173": 1.14583,
+            "174": 1.14217,
+            "175": 1.14015,
+            "176": 1.14319,
+            "177": 1.14097,
+            "178": 1.14115,
+            "179": 1.14122,
+            "180": 1.15137,
+            "181": 1.14856,
+            "182": 1.15203,
+            "183": 1.14535,
+            "184": 1.13997,
+            "185": 1.15174,
+            "186": 1.18192,
+            "187": 1.14929,
+            "188": 1.14842,
+            "189": 1.14724,
+            "190": 1.14922,
+            "191": 1.14932,
+            "192": 1.14856,
+            "193": 1.1562,
+            "194": 1.153,
+            "195": 1.16371,
+            "196": 1.14525,
+            "197": 1.1411,
+            "198": 1.14592,
+            "199": 1.14301,
+            "200": 1.15088,
+            "201": 1.14229,
+            "202": 1.14171,
+            "203": 1.14083,
+            "204": 1.13968,
+            "205": 1.13977,
+            "206": 1.14177,
+            "207": 1.15548,
+            "208": 1.15609,
+            "209": 1.14509,
+            "210": 1.1487,
+            "211": 1.14163,
+            "212": 1.13971,
+            "213": 1.15326,
+            "214": 1.14129,
+            "215": 1.14055,
+            "216": 1.13893,
+            "217": 1.14191,
+            "218": 1.1418,
+            "219": 1.14249,
+            "220": 1.14162,
+            "221": 1.14077,
+            "222": 1.15513,
+            "223": 1.15668,
+            "224": 1.14515,
+            "225": 1.14589,
+            "226": 1.14548,
+            "227": 1.14318,
+            "228": 1.14204,
+            "229": 1.14391,
+            "230": 1.14565,
+            "231": 1.1439,
+            "232": 1.14309,
+            "233": 1.14396,
+            "234": 1.14146,
+            "235": 1.14229,
+            "236": 1.14106,
+            "237": 1.14362,
+            "238": 1.15203,
+            "239": 1.1942,
+            "240": 1.18025,
+            "241": 1.15197,
+            "242": 1.15276,
+            "243": 1.15399,
+            "244": 1.15628,
+            "245": 1.14958,
+            "246": 1.14931,
+            "247": 1.14093,
+            "248": 1.13869,
+            "249": 1.1385,
+            "250": 1.13897,
+            "251": 1.13787,
+            "252": 1.13939,
+            "253": 1.17282,
+            "254": 1.13361,
+            "255": 1.13502,
+            "256": 1.13895,
+            "257": 1.16245,
+            "258": 1.1352,
+            "259": 1.15685,
+            "260": 1.14637,
+            "261": 1.2867,
+            "262": 1.13699,
+            "263": 1.13959,
+            "264": 1.15414,
+            "265": 1.14324,
+            "266": 1.14515,
+            "267": 1.14328,
+            "268": 1.14359,
+            "269": 1.144,
+            "270": 1.15446,
+            "271": 1.15182,
+            "272": 1.15575,
+            "273": 1.15561,
+            "274": 1.15762,
+            "275": 1.15307,
+            "276": 1.1516,
+            "277": 1.1569,
+            "278": 1.15789,
+            "279": 1.168,
+            "280": 1.16711,
+            "281": 1.16858,
+            "282": 1.16899,
+            "283": 1.15631,
+            "284": 1.15543,
+            "285": 1.15685,
+            "286": 1.15663,
+            "287": 1.15204,
+            "288": 1.15333,
+            "289": 1.15257,
+            "290": 1.14865,
+            "291": 1.15067,
+            "292": 1.15626,
+            "293": 1.15161,
+            "294": 1.15116,
+            "295": 1.15102,
+            "296": 1.15104,
+            "297": 1.17304,
+            "298": 1.17562,
+            "299": 1.17694,
+            "300": 1.15026,
+            "301": 1.15562,
+            "302": 1.15582,
+            "303": 1.15039,
+            "304": 1.14517,
+            "305": 1.14745,
+            "306": 1.15392,
+            "307": 1.15054,
+            "308": 1.14391,
+            "309": 1.1426,
+            "310": 1.1434,
+            "311": 1.14297,
+            "312": 1.14164,
+            "313": 1.15234,
+            "314": 1.14891,
+            "315": 1.14745,
+            "316": 1.15325,
+            "317": 1.15145,
+            "318": 1.51061,
+            "319": 1.13797,
+            "320": 1.13871,
+            "321": 1.20976,
+            "322": 1.19788,
+            "323": 1.14258,
+            "324": 1.14169,
+            "325": 1.14227,
+            "326": 1.1426,
+            "327": 1.14596,
+            "328": 1.14584,
+            "329": 1.14606,
+            "330": 1.13676,
+            "331": 1.14712,
+            "332": 1.14502,
+            "333": 1.14602,
+            "334": 1.14598,
+            "335": 1.15781,
+            "336": 1.15666,
+            "337": 1.1498,
+            "338": 1.15651,
+            "339": 1.15267,
+            "340": 1.14703,
+            "341": 1.14889,
+            "342": 1.14863,
+            "343": 1.14731,
+            "344": 1.1479,
+            "345": 1.20819,
+            "346": 1.15653,
+            "347": 1.15548,
+            "348": 1.15594,
+            "349": 1.15558,
+            "350": 1.15652,
+            "351": 1.15348,
+            "352": 1.15517,
+            "353": 1.15665,
+            "354": 1.15895,
+            "355": 1.15829,
+            "356": 1.16229,
+            "357": 1.17016,
+            "358": 1.16317,
+            "359": 1.18492,
+            "360": 1.20126,
+            "361": 1.19034,
+            "362": 1.18723,
+            "363": 1.16724,
+            "364": 1.14627,
+            "365": 1.14394,
+            "366": 1.14503,
+            "367": 1.14264,
+            "368": 1.14464,
+            "369": 1.14478,
+            "370": 1.14447,
+            "371": 1.15012,
+            "372": 1.14509,
+            "373": 1.14362,
+            "374": 1.14617,
+            "375": 1.14658,
+            "376": 1.13748,
+            "377": 1.15141,
+            "378": 1.14564,
+            "379": 1.14278,
+            "380": 1.14166,
+            "381": 1.14361,
+            "382": 1.14293,
+            "383": 1.14196,
+            "384": 1.14178,
+            "385": 1.14053,
+            "386": 1.14184,
+            "387": 1.14451,
+            "388": 1.14162,
+            "389": 1.1419,
+            "390": 1.14477,
+            "391": 1.15539,
+            "392": 1.16117,
+            "393": 1.16925,
+            "394": 1.16815,
+            "395": 1.1561,
+            "396": 1.15146,
+            "397": 1.15422,
+            "398": 1.14884,
+            "399": 1.14136,
+            "400": 1.14059,
+            "401": 1.14105,
+            "402": 1.14013,
+            "403": 1.15094,
+            "404": 1.13492,
+            "405": 1.1425,
+            "406": 1.14173,
+            "407": 1.14385,
+            "408": 1.14421,
+            "409": 1.14226,
+            "410": 1.1417,
+            "411": 1.1511,
+            "412": 1.15763,
+            "413": 1.15891,
+            "414": 1.15294,
+            "415": 1.15191,
+            "416": 1.15346,
+            "417": 1.15001,
+            "418": 1.15279,
+            "419": 1.14974,
+            "420": 1.14848,
+            "421": 1.14722,
+            "422": 1.15396,
+            "423": 1.1499,
+            "424": 1.15269,
+            "425": 1.15087,
+            "426": 1.14945,
+            "427": 1.15106,
+            "428": 1.15515,
+            "429": 1.14379,
+            "430": 1.16231,
+            "431": 1.18658,
+            "432": 1.17212,
+            "433": 1.16725,
+            "434": 1.17832,
+            "435": 1.16254,
+            "436": 1.16094,
+            "437": 1.15865,
+            "438": 1.16104,
+            "439": 1.1621,
+            "440": 1.13911,
+            "441": 1.13485,
+            "442": 1.13534,
+            "443": 1.13627,
+            "444": 1.13432,
+            "445": 1.13868,
+            "446": 1.13561,
+            "447": 1.13518,
+            "448": 1.1365,
+            "449": 1.13444,
+            "450": 1.13455,
+            "451": 1.14098,
+            "452": 1.15368,
+            "453": 1.1566,
+            "454": 1.15931,
+            "455": 1.18151,
+            "456": 1.16215,
+            "457": 1.16012,
+            "458": 1.15916,
+            "459": 1.15837,
+            "460": 1.16214,
+            "461": 1.1652,
+            "462": 1.16044,
+            "463": 1.16179,
+            "464": 1.163,
+            "465": 1.16332,
+            "466": 1.15968,
+            "467": 1.16196,
+            "468": 1.1592,
+            "469": 1.15988,
+            "470": 1.16081,
+            "471": 1.16128,
+            "472": 1.15868,
+            "473": 1.16004,
+            "474": 1.16125,
+            "475": 1.15956,
+            "476": 1.16733,
+            "477": 1.18857,
+            "478": 1.15838,
+            "479": 1.16068,
+            "480": 1.16004,
+            "481": 1.15956,
+            "482": 1.15757,
+            "483": 1.15802,
+            "484": 1.16061,
+            "485": 1.15848,
+            "486": 1.16058,
+            "487": 1.15819,
+            "488": 1.15991,
+            "489": 1.15831,
+            "490": 1.1589,
+            "491": 1.16144,
+            "492": 1.15934,
+            "493": 1.15973,
+            "494": 1.16104,
+            "495": 1.15933,
+            "496": 1.16173,
+            "497": 1.16203,
+            "498": 1.16059,
+            "499": 1.16461,
+            "500": 1.16533,
+            "501": 1.1723,
+            "502": 1.17075,
+            "503": 1.17256,
+            "504": 1.16176,
+            "505": 1.15972,
+            "506": 1.16185,
+            "507": 1.21311,
+            "508": 1.16326,
+            "509": 1.15384,
+            "510": 1.15071,
+            "511": 1.15307,
+            "512": 1.15748,
+            "513": 1.1518,
+            "514": 1.15181,
+            "515": 1.15338,
+            "516": 1.1524,
+            "517": 1.15481,
+            "518": 1.15358,
+            "519": 1.16302,
+            "520": 1.16218,
+            "521": 1.15461,
+            "522": 1.157,
+            "523": 1.15817,
+            "524": 1.15517,
+            "525": 1.15361,
+            "526": 1.15183,
+            "527": 1.15237,
+            "528": 1.15423,
+            "529": 1.15637,
+            "530": 1.15521,
+            "531": 1.15012,
+            "532": 1.15132,
+            "533": 1.1495,
+            "534": 1.14919,
+            "535": 1.1546,
+            "536": 1.15442,
+            "537": 1.1514,
+            "538": 1.15195,
+            "539": 1.15221,
+            "540": 1.15639,
+            "541": 1.1549,
+            "542": 1.15495,
+            "543": 1.15683,
+            "544": 1.16361,
+            "545": 1.16186,
+            "546": 1.15697,
+            "547": 1.15978,
+            "548": 1.16151,
+            "549": 1.15737,
+            "550": 1.15451,
+            "551": 1.16057,
+            "552": 1.20604,
+            "553": 1.15937,
+            "554": 1.21638,
+            "555": 1.16193,
+            "556": 1.16004,
+            "557": 1.15937,
+            "558": 1.15924,
+            "559": 1.15864,
+            "560": 1.16064,
+            "561": 1.15935,
+            "562": 1.43389,
+            "563": 1.16041,
+            "564": 1.16122,
+            "565": 1.49173,
+            "566": 1.15954,
+            "567": 1.17345,
+            "568": 1.16261,
+            "569": 1.15966,
+            "570": 1.1607,
+            "571": 1.15553,
+            "572": 1.1568,
+            "573": 1.15385,
+            "574": 1.15701,
+            "575": 1.15849,
+            "576": 1.15634,
+            "577": 1.15908,
+            "578": 1.15576,
+            "579": 1.15627,
+            "580": 1.14973,
+            "581": 1.16027,
+            "582": 1.16176,
+            "583": 1.15493,
+            "584": 1.15722,
+            "585": 1.15744,
+            "586": 1.15502,
+            "587": 1.1559,
+            "588": 1.15496,
+            "589": 1.16378,
+            "590": 1.16595,
+            "591": 1.16611,
+            "592": 1.16989,
+            "593": 1.16842,
+            "594": 1.17261,
+            "595": 1.15925,
+            "596": 1.16083,
+            "597": 1.16113,
+            "598": 1.16297,
+            "599": 1.16456,
+            "600": 1.15983,
+            "601": 1.16187,
+            "602": 1.15943,
+            "603": 1.15985,
+            "604": 1.1592,
+            "605": 1.15871,
+            "606": 1.16032,
+            "607": 1.15919,
+            "608": 1.17988,
+            "609": 1.16067,
+            "610": 1.18157,
+            "611": 1.15299,
+            "612": 1.15282,
+            "613": 1.15274,
+            "614": 1.15344,
+            "615": 1.15192,
+            "616": 1.15757,
+            "617": 1.15404,
+            "618": 1.16198,
+            "619": 1.12381,
+            "620": 1.11492,
+            "621": 1.14943,
+            "622": 1.16512,
+            "623": 1.16958,
+            "624": 1.16409,
+            "625": 1.15844,
+            "626": 1.14917,
+            "627": 1.15285,
+            "628": 1.15477,
+            "629": 1.15363,
+            "630": 1.15213,
+            "631": 1.14647,
+            "632": 1.14867,
+            "633": 1.15423,
+            "634": 1.15566,
+            "635": 1.15345,
+            "636": 1.15319,
+            "637": 1.1511,
+            "638": 1.15409,
+            "639": 1.15188,
+            "640": 1.15258,
+            "641": 1.15414,
+            "642": 1.15983,
+            "643": 1.15819,
+            "644": 1.15887,
+            "645": 1.15631,
+            "646": 1.15765,
+            "647": 1.16277,
+            "648": 1.16768,
+            "649": 1.17095,
+            "650": 1.16972,
+            "651": 1.16894,
+            "652": 1.16584,
+            "653": 1.1612,
+            "654": 1.17303,
+            "655": 1.16406,
+            "656": 1.1617,
+            "657": 1.16573,
+            "658": 1.16082,
+            "659": 1.16677,
+            "660": 1.16969,
+            "661": 1.16374,
+            "662": 1.16155,
+            "663": 1.16674,
+            "664": 1.16865,
+            "665": 1.16719,
+            "666": 1.16772,
+            "667": 1.16872,
+            "668": 1.16616,
+            "669": 1.16505,
+            "670": 1.16449,
+            "671": 1.16777,
+            "672": 1.16457,
+            "673": 1.16059,
+            "674": 1.16013,
+            "675": 1.1589,
+            "676": 1.1645,
+            "677": 1.16737,
+            "678": 1.16262,
+            "679": 1.44417,
+            "680": 1.16641,
+            "681": 1.16441,
+            "682": 1.16834,
+            "683": 1.17163,
+            "684": 1.16041,
+            "685": 1.16815,
+            "686": 1.16615,
+            "687": 1.1689,
+            "688": 1.16377,
+            "689": 1.16277,
+            "690": 1.15926,
+            "691": 1.15823,
+            "692": 1.15747,
+            "693": 1.15897,
+            "694": 1.15722,
+            "695": 1.15679,
+            "696": 1.15619,
+            "697": 1.15686,
+            "698": 1.15548,
+            "699": 1.15619,
+            "700": 1.15662,
+            "701": 1.15701,
+            "702": 1.15611,
+            "703": 1.1578,
+            "704": 1.15921,
+            "705": 1.15626,
+            "706": 1.15696,
+            "707": 1.15676,
+            "708": 1.15718,
+            "709": 1.15643,
+            "710": 1.16154,
+            "711": 1.15995,
+            "712": 1.159,
+            "713": 1.16786,
+            "714": 1.15799,
+            "715": 1.15749,
+            "716": 1.52131,
+            "717": 1.15676,
+            "718": 1.16066,
+            "719": 1.15878,
+            "720": 1.16243,
+            "721": 1.15801,
+            "722": 1.16032,
+            "723": 1.15929,
+            "724": 1.16338,
+            "725": 1.15949,
+            "726": 1.16444,
+            "727": 1.31697,
+            "728": 1.15571,
+            "729": 1.15513,
+            "730": 1.15845,
+            "731": 1.16172,
+            "732": 1.15814,
+            "733": 1.1597,
+            "734": 1.15388,
+            "735": 1.15282,
+            "736": 1.15589,
+            "737": 1.15547,
+            "738": 1.1547,
+            "739": 1.15614,
+            "740": 1.15546,
+            "741": 1.15558,
+            "742": 1.15607,
+            "743": 1.15425,
+            "744": 1.15442,
+            "745": 1.16502,
+            "746": 1.15566,
+            "747": 1.15865,
+            "748": 1.15828,
+            "749": 1.16418,
+            "750": 1.15709,
+            "751": 1.15988,
+            "752": 1.15915,
+            "753": 1.15069,
+            "754": 1.15176,
+            "755": 1.15161,
+            "756": 1.1502,
+            "757": 1.14643,
+            "758": 1.7155,
+            "759": 1.15471,
+            "760": 1.15638,
+            "761": 1.15684,
+            "762": 1.16005,
+            "763": 1.1585,
+            "764": 1.16197,
+            "765": 1.22988,
+            "766": 1.16563,
+            "767": 1.16594,
+            "768": 1.16751,
+            "769": 1.16167,
+            "770": 1.16736,
+            "771": 1.16232,
+            "772": 1.16021,
+            "773": 1.16138,
+            "774": 1.16446,
+            "775": 1.15216,
+            "776": 1.15086,
+            "777": 1.15506,
+            "778": 1.15465,
+            "779": 1.15872,
+            "780": 1.15533,
+            "781": 1.15836,
+            "782": 1.15778,
+            "783": 1.21735,
+            "784": 1.15535,
+            "785": 1.14905,
+            "786": 1.14868,
+            "787": 1.14899,
+            "788": 1.1521,
+            "789": 1.1498,
+            "790": 1.15389,
+            "791": 1.15198,
+            "792": 1.14834,
+            "793": 1.14935,
+            "794": 1.14986,
+            "795": 1.15066,
+            "796": 1.15229,
+            "797": 1.15036,
+            "798": 1.15026,
+            "799": 1.15231,
+            "800": 1.15717,
+            "801": 1.15355,
+            "802": 1.15502,
+            "803": 1.15201,
+            "804": 1.15023,
+            "805": 1.15209,
+            "806": 1.15072,
+            "807": 1.48449,
+            "808": 1.15218,
+            "809": 1.1522,
+            "810": 1.15111,
+            "811": 1.15134,
+            "812": 1.15187,
+            "813": 1.15379,
+            "814": 1.15585,
+            "815": 1.16392,
+            "816": 1.15452,
+            "817": 1.15487,
+            "818": 1.15245,
+            "819": 1.14836,
+            "820": 1.14547,
+            "821": 1.74382,
+            "822": 1.14655,
+            "823": 1.13629,
+            "824": 1.15244,
+            "825": 1.14064,
+            "826": 1.14002,
+            "827": 1.14234,
+            "828": 1.1401,
+            "829": 1.13945,
+            "830": 1.14243,
+            "831": 1.14339,
+            "832": 1.13963,
+            "833": 1.14165,
+            "834": 1.13931,
+            "835": 1.13828,
+            "836": 1.13924,
+            "837": 1.13918,
+            "838": 1.14038,
+            "839": 1.14023,
+            "840": 1.13827,
+            "841": 1.14334,
+            "842": 1.26736,
+            "843": 1.15235,
+            "844": 1.16327,
+            "845": 1.15615,
+            "846": 1.15656,
+            "847": 1.14563,
+            "848": 1.14836,
+            "849": 1.14901,
+            "850": 1.14852,
+            "851": 1.15019,
+            "852": 1.14893,
+            "853": 1.14907,
+            "854": 1.14895,
+            "855": 1.14997,
+            "856": 1.14951,
+            "857": 1.15014,
+            "858": 1.14881,
+            "859": 1.15072,
+            "860": 1.16126,
+            "861": 1.15807,
+            "862": 1.15716,
+            "863": 1.15555,
+            "864": 1.15038,
+            "865": 1.15177,
+            "866": 1.15177,
+            "867": 1.14884,
+            "868": 1.14782,
+            "869": 1.15086,
+            "870": 1.14982,
+            "871": 1.14833,
+            "872": 1.14875,
+            "873": 1.15147,
+            "874": 1.15225,
+            "875": 1.29099,
+            "876": 2.39847,
+            "877": 2.16612,
+            "878": 1.53276,
+            "879": 1.14604,
+            "880": 1.1515,
+            "881": 1.16208,
+            "882": 1.15925,
+            "883": 1.14916,
+            "884": 1.14927,
+            "885": 1.1758,
+            "886": 1.17545,
+            "887": 1.17369,
+            "888": 1.17655,
+            "889": 1.16376,
+            "890": 1.14874,
+            "891": 1.148,
+            "892": 1.14787,
+            "893": 1.15123,
+            "894": 1.15168,
+            "895": 1.15419,
+            "896": 1.15535,
+            "897": 1.15242,
+            "898": 1.15508,
+            "899": 1.15225,
+            "900": 1.15072,
+            "901": 1.1534,
+            "902": 1.15136,
+            "903": 1.15481,
+            "904": 1.15989,
+            "905": 1.16184,
+            "906": 1.14716,
+            "907": 1.15192,
+            "908": 1.15696,
+            "909": 1.15328,
+            "910": 1.14059,
+            "911": 1.1604,
+            "912": 1.14941,
+            "913": 1.14972,
+            "914": 1.14954,
+            "915": 1.15073,
+            "916": 1.14475,
+            "917": 1.15414,
+            "918": 1.1385,
+            "919": 1.14185,
+            "920": 1.14089,
+            "921": 1.13784,
+            "922": 1.13875,
+            "923": 1.13882,
+            "924": 1.14141,
+            "925": 1.13908,
+            "926": 1.13874,
+            "927": 1.13823,
+            "928": 1.13737,
+            "929": 1.13836,
+            "930": 1.13809,
+            "931": 1.14893,
+            "932": 1.13972,
+            "933": 1.1369,
+            "934": 1.1362,
+            "935": 1.13765,
+            "936": 1.14369,
+            "937": 1.1504,
+            "938": 1.14208,
+            "939": 1.14841,
+            "940": 1.14975,
+            "941": 1.14225,
+            "942": 1.14185,
+            "943": 1.13864,
+            "944": 1.13915,
+            "945": 1.14062,
+            "946": 1.15111,
+            "947": 1.14071,
+            "948": 1.13898,
+            "949": 1.1399,
+            "950": 1.15937,
+            "951": 1.16785,
+            "952": 1.16807,
+            "953": 1.1506,
+            "954": 1.15006,
+            "955": 1.15045,
+            "956": 1.17067,
+            "957": 1.14856,
+            "958": 1.14992,
+            "959": 1.15251,
+            "960": 1.15045,
+            "961": 1.15121,
+            "962": 1.14957,
+            "963": 1.15095,
+            "964": 1.15,
+            "965": 1.15089,
+            "966": 1.15156,
+            "967": 1.15423,
+            "968": 1.16332,
+            "969": 1.15359,
+            "970": 1.15613,
+            "971": 1.15232,
+            "972": 1.15652,
+            "973": 1.15399,
+            "974": 1.15065,
+            "975": 1.1485,
+            "976": 1.15243,
+            "977": 1.15368,
+            "978": 1.14828,
+            "979": 1.14969,
+            "980": 1.15374,
+            "981": 1.1505,
+            "982": 1.15031,
+            "983": 1.15033,
+            "984": 1.14921,
+            "985": 1.15504,
+            "986": 1.15572,
+            "987": 1.153,
+            "988": 1.15573,
+            "989": 1.14747,
+            "990": 1.14636,
+            "991": 1.14517,
+            "992": 1.1463,
+            "993": 1.14805,
+            "994": 1.14644,
+            "995": 1.14583,
+            "996": 1.14485,
+            "997": 1.14418,
+            "998": 1.14622,
+            "999": 1.14662,
+            "1000": 1.14312,
+            "1001": 1.15227,
+            "1002": 1.14681,
+            "1003": 1.14794,
+            "1004": 1.14889,
+            "1005": 1.15067,
+            "1006": 1.14757,
+            "1007": 1.14767,
+            "1008": 1.15061,
+            "1009": 1.15075,
+            "1010": 1.14894,
+            "1011": 1.14975,
+            "1012": 1.14667,
+            "1013": 1.14688,
+            "1014": 1.14788,
+            "1015": 1.167,
+            "1016": 1.44606,
+            "1017": 1.14923,
+            "1018": 1.15268,
+            "1019": 1.14981,
+            "1020": 1.15011,
+            "1021": 1.47391,
+            "1022": 1.15277,
+            "1023": 1.14774,
+            "1024": 1.146,
+            "1025": 1.15253,
+            "1026": 1.14633,
+            "1027": 1.14525,
+            "1028": 1.14728,
+            "1029": 1.14654,
+            "1030": 1.14663,
+            "1031": 1.14708,
+            "1032": 1.14715,
+            "1033": 1.1454,
+            "1034": 1.14763,
+            "1035": 1.14591,
+            "1036": 1.14493,
+            "1037": 1.14584,
+            "1038": 1.14665,
+            "1039": 1.14812,
+            "1040": 1.14495,
+            "1041": 1.15044,
+            "1042": 1.14701,
+            "1043": 1.14657,
+            "1044": 1.14631,
+            "1045": 1.14822,
+            "1046": 1.14789,
+            "1047": 1.14525,
+            "1048": 1.14815,
+            "1049": 1.14939,
+            "1050": 1.14592,
+            "1051": 1.14667,
+            "1052": 1.15232,
+            "1053": 1.14863,
+            "1054": 1.14908,
+            "1055": 1.14931,
+            "1056": 1.14644,
+            "1057": 1.149,
+            "1058": 1.14751,
+            "1059": 1.14668,
+            "1060": 1.14758,
+            "1061": 1.14789,
+            "1062": 1.43562,
+            "1063": 1.14875,
+            "1064": 1.14846,
+            "1065": 1.14888,
+            "1066": 1.15486,
+            "1067": 1.15212,
+            "1068": 1.14934,
+            "1069": 1.14526,
+            "1070": 1.14506,
+            "1071": 1.14599,
+            "1072": 1.14774,
+            "1073": 1.14651,
+            "1074": 1.14609,
+            "1075": 1.14817,
+            "1076": 1.14662,
+            "1077": 1.15159,
+            "1078": 1.14735,
+            "1079": 1.14525,
+            "1080": 1.1516,
+            "1081": 1.14601,
+            "1082": 1.13989,
+            "1083": 1.13569,
+            "1084": 1.1371,
+            "1085": 1.1366,
+            "1086": 1.13713,
+            "1087": 1.13756,
+            "1088": 1.13768,
+            "1089": 1.13917,
+            "1090": 1.13759,
+            "1091": 1.13884,
+            "1092": 1.13707,
+            "1093": 1.13679,
+            "1094": 1.13513,
+            "1095": 1.1351,
+            "1096": 1.13494,
+            "1097": 1.13589,
+            "1098": 1.14132,
+            "1099": 1.13697,
+            "1100": 1.14195,
+            "1101": 1.14189,
+            "1102": 1.13736,
+            "1103": 1.13781,
+            "1104": 1.14284,
+            "1105": 1.13518,
+            "1106": 1.13585,
+            "1107": 1.13621,
+            "1108": 1.13665,
+            "1109": 1.13792,
+            "1110": 1.13764,
+            "1111": 1.13778,
+            "1112": 1.13619,
+            "1113": 1.13651,
+            "1114": 1.13628,
+            "1115": 1.13802,
+            "1116": 1.13792,
+            "1117": 1.13642,
+            "1118": 1.13784,
+            "1119": 1.14898,
+            "1120": 1.15049,
+            "1121": 1.15028,
+            "1122": 1.14509,
+            "1123": 1.1445,
+            "1124": 1.14756,
+            "1125": 1.15117,
+            "1126": 1.14917,
+            "1127": 1.1475,
+            "1128": 1.1481,
+            "1129": 1.14683,
+            "1130": 1.14088,
+            "1131": 1.13493,
+            "1132": 1.13613,
+            "1133": 1.13537,
+            "1134": 1.13473,
+            "1135": 1.13657,
+            "1136": 1.13516,
+            "1137": 1.13606,
+            "1138": 1.13473,
+            "1139": 1.13442,
+            "1140": 1.13398,
+            "1141": 1.13591,
+            "1142": 1.13975,
+            "1143": 1.13478,
+            "1144": 1.13376,
+            "1145": 1.13428,
+            "1146": 1.1348,
+            "1147": 1.13462,
+            "1148": 1.1351,
+            "1149": 1.13494,
+            "1150": 1.13506,
+            "1151": 1.13487,
+            "1152": 1.14039,
+            "1153": 1.13991,
+            "1154": 1.13825,
+            "1155": 1.1373,
+            "1156": 1.13451,
+            "1157": 1.13683,
+            "1158": 1.13335,
+            "1159": 1.13548,
+            "1160": 1.1339,
+            "1161": 1.13613,
+            "1162": 1.13429,
+            "1163": 1.13448,
+            "1164": 1.13542,
+            "1165": 1.13453,
+            "1166": 1.13398,
+            "1167": 1.13549,
+            "1168": 1.1342,
+            "1169": 1.13502,
+            "1170": 1.13535,
+            "1171": 1.13581,
+            "1172": 1.13532,
+            "1173": 1.13552,
+            "1174": 1.13371,
+            "1175": 1.13456,
+            "1176": 1.13401,
+            "1177": 1.1335,
+            "1178": 1.13628,
+            "1179": 1.13907,
+            "1180": 1.13757,
+            "1181": 1.1538,
+            "1182": 1.15712,
+            "1183": 1.16123,
+            "1184": 1.15318,
+            "1185": 1.14801,
+            "1186": 1.14711,
+            "1187": 1.1471,
+            "1188": 1.15109,
+            "1189": 1.14707,
+            "1190": 1.14787,
+            "1191": 1.1451,
+            "1192": 1.14677,
+            "1193": 1.14621,
+            "1194": 1.14554,
+            "1195": 1.14738,
+            "1196": 1.14756,
+            "1197": 1.14799,
+            "1198": 1.1487,
+            "1199": 1.14616,
+            "1200": 1.14688,
+            "1201": 1.14531,
+            "1202": 1.14639,
+            "1203": 1.14696,
+            "1204": 1.1469,
+            "1205": 1.1472,
+            "1206": 1.14687,
+            "1207": 1.1494,
+            "1208": 1.14873,
+            "1209": 1.15175,
+            "1210": 1.14868,
+            "1211": 1.14793,
+            "1212": 1.14766,
+            "1213": 1.14823,
+            "1214": 1.15557,
+            "1215": 1.15986,
+            "1216": 1.14175,
+            "1217": 1.1392,
+            "1218": 1.13591,
+            "1219": 1.13796,
+            "1220": 1.14086,
+            "1221": 1.14081,
+            "1222": 1.13816,
+            "1223": 1.13977,
+            "1224": 1.14436,
+            "1225": 1.13986,
+            "1226": 1.13821,
+            "1227": 1.13854,
+            "1228": 1.13738,
+            "1229": 1.1384,
+            "1230": 1.13897,
+            "1231": 1.13732,
+            "1232": 1.13852,
+            "1233": 1.14144,
+            "1234": 1.13711,
+            "1235": 1.14105,
+            "1236": 1.13578,
+            "1237": 1.13838,
+            "1238": 1.13809,
+            "1239": 1.13782,
+            "1240": 1.13859,
+            "1241": 1.1381,
+            "1242": 1.13717,
+            "1243": 1.14814,
+            "1244": 1.16451,
+            "1245": 1.17765,
+            "1246": 1.17167,
+            "1247": 1.15708,
+            "1248": 1.15406,
+            "1249": 1.17391,
+            "1250": 1.14803,
+            "1251": 1.14601,
+            "1252": 1.14796,
+            "1253": 1.14706,
+            "1254": 1.14679,
+            "1255": 1.14306,
+            "1256": 1.14387,
+            "1257": 1.14608,
+            "1258": 1.14617,
+            "1259": 1.14999,
+            "1260": 1.1468,
+            "1261": 1.14332,
+            "1262": 1.15005,
+            "1263": 1.1449,
+            "1264": 1.14544,
+            "1265": 1.14292,
+            "1266": 1.14481,
+            "1267": 1.154,
+            "1268": 1.15455,
+            "1269": 1.15329,
+            "1270": 1.15008,
+            "1271": 1.15345,
+            "1272": 1.14616,
+            "1273": 1.15423,
+            "1274": 1.15349,
+            "1275": 1.14785,
+            "1276": 1.14536,
+            "1277": 1.14467,
+            "1278": 1.1456,
+            "1279": 1.14593,
+            "1280": 1.1462,
+            "1281": 1.14599,
+            "1282": 1.14837,
+            "1283": 1.14585,
+            "1284": 1.14656,
+            "1285": 1.14618,
+            "1286": 1.14615,
+            "1287": 1.14657,
+            "1288": 1.44686,
+            "1289": 1.14572,
+            "1290": 1.14398,
+            "1291": 1.1431,
+            "1292": 1.14524,
+            "1293": 1.14421,
+            "1294": 1.14593,
+            "1295": 1.16051,
+            "1296": 1.16214,
+            "1297": 1.15606,
+            "1298": 1.14439,
+            "1299": 1.14445,
+            "1300": 1.1445,
+            "1301": 1.1455,
+            "1302": 1.14117,
+            "1303": 1.14365,
+            "1304": 1.14474,
+            "1305": 1.14456,
+            "1306": 1.14522,
+            "1307": 1.144,
+            "1308": 1.14453,
+            "1309": 1.14471,
+            "1310": 1.1456,
+            "1311": 1.15495,
+            "1312": 1.15256,
+            "1313": 1.14805,
+            "1314": 1.14996,
+            "1315": 1.14425,
+            "1316": 1.14401,
+            "1317": 1.14262,
+            "1318": 1.14556,
+            "1319": 1.14661,
+            "1320": 1.14567,
+            "1321": 1.14648,
+            "1322": 1.14709,
+            "1323": 1.14522,
+            "1324": 1.14764,
+            "1325": 1.14331,
+            "1326": 1.14538,
+            "1327": 1.1453,
+            "1328": 1.14734,
+            "1329": 1.18619,
+            "1330": 1.48212,
+            "1331": 1.14651,
+            "1332": 1.15204,
+            "1333": 1.14629,
+            "1334": 1.14624,
+            "1335": 1.14927,
+            "1336": 1.14601,
+            "1337": 1.15642,
+            "1338": 1.14811,
+            "1339": 1.14508,
+            "1340": 1.15069,
+            "1341": 1.14629,
+            "1342": 1.14635,
+            "1343": 1.14657,
+            "1344": 1.14655,
+            "1345": 1.14564,
+            "1346": 1.14633,
+            "1347": 1.14523,
+            "1348": 1.14691,
+            "1349": 1.14575,
+            "1350": 1.14592,
+            "1351": 1.14631,
+            "1352": 1.14436,
+            "1353": 1.14573,
+            "1354": 1.14471,
+            "1355": 1.14554,
+            "1356": 1.14492,
+            "1357": 1.14301,
+            "1358": 1.141,
+            "1359": 1.14219,
+            "1360": 1.14228,
+            "1361": 1.14109,
+            "1362": 1.1413,
+            "1363": 1.14096,
+            "1364": 1.15355,
+            "1365": 1.14229,
+            "1366": 1.14615,
+            "1367": 1.14174,
+            "1368": 1.13953,
+            "1369": 1.14014,
+            "1370": 1.14132,
+            "1371": 1.14139,
+            "1372": 1.13849,
+            "1373": 1.14304,
+            "1374": 1.14028,
+            "1375": 1.13912,
+            "1376": 1.14082,
+            "1377": 1.1416,
+            "1378": 1.13936,
+            "1379": 1.13866,
+            "1380": 1.13826,
+            "1381": 1.14443,
+            "1382": 1.14029,
+            "1383": 1.13913,
+            "1384": 1.14177,
+            "1385": 1.14492,
+            "1386": 1.1415,
+            "1387": 1.1398,
+            "1388": 1.14017,
+            "1389": 1.14077,
+            "1390": 1.14782,
+            "1391": 1.15011,
+            "1392": 1.15174,
+            "1393": 1.14605,
+            "1394": 1.14761,
+            "1395": 1.14735,
+            "1396": 1.14827,
+            "1397": 1.14566,
+            "1398": 1.14659,
+            "1399": 1.14187,
+            "1400": 1.14737,
+            "1401": 1.14674,
+            "1402": 1.14468,
+            "1403": 1.14534,
+            "1404": 1.14726,
+            "1405": 1.14773,
+            "1406": 1.14711,
+            "1407": 1.14543,
+            "1408": 1.14568,
+            "1409": 1.14559,
+            "1410": 1.14443,
+            "1411": 1.14591,
+            "1412": 1.14444,
+            "1413": 1.14904,
+            "1414": 1.14806,
+            "1415": 1.14757,
+            "1416": 1.14307,
+            "1417": 1.14119,
+            "1418": 1.14392,
+            "1419": 1.14104,
+            "1420": 1.14278,
+            "1421": 1.13949,
+            "1422": 1.14028,
+            "1423": 1.14112,
+            "1424": 1.14151,
+            "1425": 1.14321,
+            "1426": 1.14894,
+            "1427": 1.14281,
+            "1428": 1.14881,
+            "1429": 1.14225,
+            "1430": 1.13905,
+            "1431": 1.14148,
+            "1432": 1.14895,
+            "1433": 1.15186,
+            "1434": 1.14773,
+            "1435": 1.14968,
+            "1436": 1.14689,
+            "1437": 1.1487,
+            "1438": 1.14731,
+            "1439": 1.14746,
+            "1440": 1.14835,
+            "1441": 1.15151,
+            "1442": 1.15182,
+            "1443": 1.15073,
+            "1444": 1.14751,
+            "1445": 1.15081,
+            "1446": 1.15106,
+            "1447": 1.14876,
+            "1448": 1.15178,
+            "1449": 1.15117,
+            "1450": 1.1479,
+            "1451": 1.14851,
+            "1452": 1.14502,
+            "1453": 1.1454,
+            "1454": 1.14722,
+            "1455": 1.14628,
+            "1456": 1.14413,
+            "1457": 1.14761,
+            "1458": 1.14681,
+            "1459": 1.14632,
+            "1460": 1.14804,
+            "1461": 1.14676,
+            "1462": 1.14566,
+            "1463": 1.14599,
+            "1464": 1.14679,
+            "1465": 1.14572,
+            "1466": 1.14995,
+            "1467": 1.14848,
+            "1468": 1.14679,
+            "1469": 1.15027,
+            "1470": 1.14636,
+            "1471": 1.14406,
+            "1472": 1.14039,
+            "1473": 1.13768,
+            "1474": 1.13897,
+            "1475": 1.14331,
+            "1476": 1.1403,
+            "1477": 1.14139,
+            "1478": 1.14985,
+            "1479": 1.14611,
+            "1480": 1.47655,
+            "1481": 1.45511,
+            "1482": 1.14381,
+            "1483": 1.13941,
+            "1484": 1.13782,
+            "1485": 1.13771,
+            "1486": 1.13796,
+            "1487": 1.13795,
+            "1488": 1.13829,
+            "1489": 1.13758,
+            "1490": 1.13822,
+            "1491": 1.13667,
+            "1492": 1.13847,
+            "1493": 1.13787,
+            "1494": 1.14072,
+            "1495": 1.14614,
+            "1496": 1.14436,
+            "1497": 1.14422,
+            "1498": 1.1393,
+            "1499": 1.13987,
+            "1500": 1.13991,
+            "1501": 1.14215,
+            "1502": 1.13842,
+            "1503": 1.13883,
+            "1504": 1.1496,
+            "1505": 1.14028,
+            "1506": 1.13931,
+            "1507": 1.13949,
+            "1508": 1.14063,
+            "1509": 1.13913,
+            "1510": 1.1402,
+            "1511": 1.13931,
+            "1512": 1.13839,
+            "1513": 1.13771,
+            "1514": 1.13848,
+            "1515": 1.13796,
+            "1516": 1.13782,
+            "1517": 1.13889,
+            "1518": 1.13716,
+            "1519": 1.13908,
+            "1520": 1.13972,
+            "1521": 1.13966,
+            "1522": 1.13875,
+            "1523": 1.15781,
+            "1524": 1.15885,
+            "1525": 1.15802,
+            "1526": 1.14191,
+            "1527": 1.14054,
+            "1528": 1.1385,
+            "1529": 1.13922,
+            "1530": 1.12994,
+            "1531": 1.12552,
+            "1532": 1.27166,
+            "1533": 1.12707,
+            "1534": 1.12638,
+            "1535": 1.12608,
+            "1536": 1.12654,
+            "1537": 1.12511,
+            "1538": 1.16008,
+            "1539": 1.13169,
+            "1540": 1.13294,
+            "1541": 1.13386,
+            "1542": 1.13461,
+            "1543": 1.13337,
+            "1544": 1.1331,
+            "1545": 1.13294,
+            "1546": 1.13283,
+            "1547": 1.13316,
+            "1548": 1.13651,
+            "1549": 1.13626,
+            "1550": 1.13638,
+            "1551": 1.13187,
+            "1552": 1.20522,
+            "1553": 1.15894,
+            "1554": 1.14738,
+            "1555": 1.14563,
+            "1556": 1.14409,
+            "1557": 1.15018,
+            "1558": 1.14323,
+            "1559": 1.14591,
+            "1560": 1.14645,
+            "1561": 1.14673,
+            "1562": 1.14543,
+            "1563": 1.14518,
+            "1564": 1.14589,
+            "1565": 1.14486,
+            "1566": 1.14436,
+            "1567": 1.14357,
+            "1568": 1.1454,
+            "1569": 1.14493,
+            "1570": 1.14347,
+            "1571": 1.14477,
+            "1572": 1.14203,
+            "1573": 1.14441,
+            "1574": 1.14468,
+            "1575": 1.14607,
+            "1576": 1.14532,
+            "1577": 1.14389,
+            "1578": 1.1433,
+            "1579": 1.14321,
+            "1580": 1.14391,
+            "1581": 1.1421,
+            "1582": 1.14368,
+            "1583": 1.1444,
+            "1584": 1.14356,
+            "1585": 1.14875,
+            "1586": 1.14497,
+            "1587": 1.14521,
+            "1588": 1.14708,
+            "1589": 1.14631,
+            "1590": 1.14662,
+            "1591": 1.14949,
+            "1592": 1.15354,
+            "1593": 1.14014,
+            "1594": 1.1408,
+            "1595": 1.14166,
+            "1596": 1.14151,
+            "1597": 1.14228,
+            "1598": 1.14126,
+            "1599": 1.14028,
+            "1600": 1.14528,
+            "1601": 1.14125,
+            "1602": 1.14085,
+            "1603": 1.13862,
+            "1604": 1.13487,
+            "1605": 1.13314,
+            "1606": 1.13467,
+            "1607": 1.13153,
+            "1608": 1.12971,
+            "1609": 1.13044,
+            "1610": 1.14013,
+            "1611": 1.13008,
+            "1612": 1.13161,
+            "1613": 1.13128,
+            "1614": 1.13059,
+            "1615": 1.13169,
+            "1616": 1.13043,
+            "1617": 1.13141,
+            "1618": 1.12976,
+            "1619": 1.13071,
+            "1620": 1.12907,
+            "1621": 1.13138,
+            "1622": 1.12994,
+            "1623": 1.12985,
+            "1624": 1.12999,
+            "1625": 1.13035,
+            "1626": 1.13761,
+            "1627": 1.13703,
+            "1628": 1.15487,
+            "1629": 1.13257,
+            "1630": 1.13549,
+            "1631": 1.13358,
+            "1632": 1.13488,
+            "1633": 1.13601,
+            "1634": 1.13282,
+            "1635": 1.13439,
+            "1636": 1.13078,
+            "1637": 1.13147,
+            "1638": 1.13065,
+            "1639": 1.13181,
+            "1640": 1.13227,
+            "1641": 1.13282,
+            "1642": 1.13305,
+            "1643": 1.19491,
+            "1644": 1.15821,
+            "1645": 1.15349,
+            "1646": 1.1437,
+            "1647": 1.1416,
+            "1648": 1.14282,
+            "1649": 1.1408,
+            "1650": 1.13388,
+            "1651": 1.13396,
+            "1652": 1.15414,
+            "1653": 1.13734,
+            "1654": 1.13143,
+            "1655": 1.13124,
+            "1656": 1.13417,
+            "1657": 1.13376,
+            "1658": 1.12932,
+            "1659": 1.13161,
+            "1660": 1.13178,
+            "1661": 1.1315,
+            "1662": 1.13209,
+            "1663": 1.13118,
+            "1664": 1.13332,
+            "1665": 1.12981,
+            "1666": 1.13001,
+            "1667": 1.12943,
+            "1668": 1.12938,
+            "1669": 1.12973,
+            "1670": 1.13031,
+            "1671": 1.14164,
+            "1672": 1.14108,
+            "1673": 1.14165,
+            "1674": 1.14189,
+            "1675": 1.14174,
+            "1676": 1.14802,
+            "1677": 1.14434,
+            "1678": 1.14543,
+            "1679": 1.14285,
+            "1680": 1.14529,
+            "1681": 1.14548,
+            "1682": 1.14333,
+            "1683": 1.14553,
+            "1684": 1.14327,
+            "1685": 1.1476,
+            "1686": 1.1406,
+            "1687": 1.13769,
+            "1688": 1.13364,
+            "1689": 1.13418,
+            "1690": 1.13026,
+            "1691": 1.13222,
+            "1692": 1.13195,
+            "1693": 1.13247,
+            "1694": 1.13264,
+            "1695": 1.13167,
+            "1696": 1.13234,
+            "1697": 1.13335,
+            "1698": 1.13463,
+            "1699": 1.1337,
+            "1700": 1.13362,
+            "1701": 1.13339,
+            "1702": 1.13335,
+            "1703": 1.13412,
+            "1704": 1.1332,
+            "1705": 1.13109,
+            "1706": 1.13306,
+            "1707": 1.42699,
+            "1708": 1.14258,
+            "1709": 1.13227,
+            "1710": 1.13333,
+            "1711": 1.13316,
+            "1712": 1.13147,
+            "1713": 1.1325,
+            "1714": 1.13279,
+            "1715": 1.13509,
+            "1716": 1.132,
+            "1717": 1.13183,
+            "1718": 1.13123,
+            "1719": 1.13209,
+            "1720": 1.13195,
+            "1721": 1.12891,
+            "1722": 1.12633,
+            "1723": 1.12872,
+            "1724": 1.1269,
+            "1725": 1.12641,
+            "1726": 1.12585,
+            "1727": 1.12446,
+            "1728": 1.12583,
+            "1729": 1.1336,
+            "1730": 1.1322,
+            "1731": 1.13153,
+            "1732": 1.132,
+            "1733": 1.13239,
+            "1734": 1.13216,
+            "1735": 1.13252,
+            "1736": 1.13132,
+            "1737": 1.13165,
+            "1738": 1.13359,
+            "1739": 1.126,
+            "1740": 1.124,
+            "1741": 1.12533,
+            "1742": 1.12379,
+            "1743": 1.12474,
+            "1744": 1.12432,
+            "1745": 1.13505,
+            "1746": 1.13795,
+            "1747": 1.13914,
+            "1748": 1.17805,
+            "1749": 1.13962,
+            "1750": 1.13602,
+            "1751": 1.13778,
+            "1752": 1.13639,
+            "1753": 1.14452,
+            "1754": 1.14424,
+            "1755": 1.14388,
+            "1756": 1.14572,
+            "1757": 1.17074,
+            "1758": 1.14596,
+            "1759": 1.14637,
+            "1760": 1.14576,
+            "1761": 1.1441,
+            "1762": 1.13385,
+            "1763": 1.13833,
+            "1764": 1.13995,
+            "1765": 1.14229,
+            "1766": 1.2706,
+            "1767": 1.15999,
+            "1768": 1.13873,
+            "1769": 1.1421,
+            "1770": 1.13078,
+            "1771": 1.13059,
+            "1772": 1.13076,
+            "1773": 1.13527,
+            "1774": 1.13153,
+            "1775": 1.1299,
+            "1776": 1.13144,
+            "1777": 1.13048,
+            "1778": 1.1312,
+            "1779": 1.13109,
+            "1780": 1.13227,
+            "1781": 1.1318,
+            "1782": 1.13195,
+            "1783": 1.13076,
+            "1784": 1.13371,
+            "1785": 1.13513,
+            "1786": 1.13544,
+            "1787": 1.13286,
+            "1788": 1.13114,
+            "1789": 1.12859,
+            "1790": 1.13136,
+            "1791": 1.13775,
+            "1792": 1.1401,
+            "1793": 1.13769,
+            "1794": 1.13564,
+            "1795": 1.13638,
+            "1796": 1.13621,
+            "1797": 1.13614,
+            "1798": 1.13707,
+            "1799": 1.13631,
+            "1800": 1.13547,
+            "1801": 1.13673,
+            "1802": 1.13706,
+            "1803": 1.13765,
+            "1804": 1.13506,
+            "1805": 1.13603,
+            "1806": 1.13717,
+            "1807": 1.13637,
+            "1808": 1.13841,
+            "1809": 1.13734,
+            "1810": 1.1379,
+            "1811": 1.13795,
+            "1812": 1.13826,
+            "1813": 1.13875,
+            "1814": 1.13885,
+            "1815": 1.13773,
+            "1816": 1.13726,
+            "1817": 1.14087,
+            "1818": 1.1378,
+            "1819": 1.13714,
+            "1820": 1.13737,
+            "1821": 1.13928,
+            "1822": 1.1371,
+            "1823": 1.13901,
+            "1824": 1.14485,
+            "1825": 1.12803,
+            "1826": 1.12264,
+            "1827": 1.12651,
+            "1828": 1.13421,
+            "1829": 1.13198,
+            "1830": 1.13242,
+            "1831": 1.13488,
+            "1832": 1.13287,
+            "1833": 1.13394,
+            "1834": 1.13403,
+            "1835": 1.13598,
+            "1836": 1.13357,
+            "1837": 1.13518,
+            "1838": 1.13404,
+            "1839": 1.13577,
+            "1840": 1.13254,
+            "1841": 1.13422,
+            "1842": 1.13496,
+            "1843": 1.135,
+            "1844": 1.13791,
+            "1845": 1.13082,
+            "1846": 1.13135,
+            "1847": 1.13026,
+            "1848": 1.13098,
+            "1849": 1.13032,
+            "1850": 1.13038,
+            "1851": 1.13107,
+            "1852": 1.13535,
+            "1853": 1.1311,
+            "1854": 1.13935,
+            "1855": 1.13148,
+            "1856": 1.13042,
+            "1857": 1.13238,
+            "1858": 1.13034,
+            "1859": 1.13083,
+            "1860": 1.13262,
+            "1861": 1.13117,
+            "1862": 1.13181,
+            "1863": 1.13237,
+            "1864": 1.13125,
+            "1865": 1.13519,
+            "1866": 1.14006,
+            "1867": 1.13476,
+            "1868": 1.13101,
+            "1869": 1.13227,
+            "1870": 1.13399,
+            "1871": 1.13455,
+            "1872": 1.13237,
+            "1873": 1.13088,
+            "1874": 1.13163,
+            "1875": 1.13336,
+            "1876": 1.13121,
+            "1877": 1.13209,
+            "1878": 1.13199,
+            "1879": 1.13177,
+            "1880": 1.13322,
+            "1881": 1.13141,
+            "1882": 1.13236,
+            "1883": 1.12859,
+            "1884": 1.12504,
+            "1885": 1.12493,
+            "1886": 1.12502,
+            "1887": 1.12484,
+            "1888": 1.1248,
+            "1889": 1.12719,
+            "1890": 1.13286,
+            "1891": 1.1293,
+            "1892": 1.13422,
+            "1893": 1.12646,
+            "1894": 1.12508,
+            "1895": 1.12422,
+            "1896": 1.12724,
+            "1897": 1.12903,
+            "1898": 1.13203,
+            "1899": 1.12741,
+            "1900": 1.12527,
+            "1901": 1.12359,
+            "1902": 1.12382,
+            "1903": 1.12536,
+            "1904": 1.12683,
+            "1905": 1.12606,
+            "1906": 1.12607,
+            "1907": 1.12626,
+            "1908": 1.44717,
+            "1909": 1.12543,
+            "1910": 1.12376,
+            "1911": 1.12429,
+            "1912": 1.12442,
+            "1913": 1.12355,
+            "1914": 1.12476,
+            "1915": 1.12331,
+            "1916": 1.12342,
+            "1917": 1.12442,
+            "1918": 1.12472,
+            "1919": 1.12536,
+            "1920": 1.12387,
+            "1921": 1.12347,
+            "1922": 1.12561,
+            "1923": 1.12391,
+            "1924": 1.12342,
+            "1925": 1.12607,
+            "1926": 1.12383,
+            "1927": 1.12305,
+            "1928": 1.125,
+            "1929": 1.12399,
+            "1930": 1.1237,
+            "1931": 1.12459,
+            "1932": 1.12475,
+            "1933": 1.12278,
+            "1934": 1.12413,
+            "1935": 1.12588,
+            "1936": 1.12473,
+            "1937": 1.12412,
+            "1938": 1.12444,
+            "1939": 1.12303,
+            "1940": 1.12421,
+            "1941": 1.12404,
+            "1942": 1.12568,
+            "1943": 1.12645,
+            "1944": 1.12388,
+            "1945": 1.44561,
+            "1946": 1.12748,
+            "1947": 1.44404,
+            "1948": 1.12309,
+            "1949": 1.12591,
+            "1950": 1.124,
+            "1951": 1.12953,
+            "1952": 1.12429,
+            "1953": 1.48105,
+            "1954": 1.12576,
+            "1955": 1.1274,
+            "1956": 1.12693,
+            "1957": 1.1261,
+            "1958": 1.1276,
+            "1959": 1.18913,
+            "1960": 1.12817,
+            "1961": 1.12615,
+            "1962": 1.12581,
+            "1963": 1.12682,
+            "1964": 1.12747,
+            "1965": 1.14301,
+            "1966": 1.14417,
+            "1967": 1.14427,
+            "1968": 1.14017,
+            "1969": 1.13872,
+            "1970": 1.13824,
+            "1971": 1.14731,
+            "1972": 1.13727,
+            "1973": 1.13816,
+            "1974": 1.13684,
+            "1975": 1.13985,
+            "1976": 1.13777,
+            "1977": 1.13833,
+            "1978": 1.14247,
+            "1979": 1.14554,
+            "1980": 1.14074,
+            "1981": 1.1396,
+            "1982": 1.13784,
+            "1983": 1.19896,
+            "1984": 1.13952,
+            "1985": 1.13865,
+            "1986": 1.13959,
+            "1987": 1.13909,
+            "1988": 1.13875,
+            "1989": 1.13947,
+            "1990": 1.13762,
+            "1991": 1.13799,
+            "1992": 1.13904,
+            "1993": 1.13674,
+            "1994": 1.13869,
+            "1995": 1.13884,
+            "1996": 1.13807,
+            "1997": 1.13986,
+            "1998": 1.14151,
+            "1999": 1.13582,
+            "2000": 1.16726
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..b6e543e2cf8
--- /dev/null
+++ b/tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_h100_mcore_tp4_cp2_native_fp8_tp_sp_cp_tp_overlap/golden_values_dev_dgx_h100.json
@@ -0,0 +1,10037 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 10.85229,
+            "2": 10.85951,
+            "3": 10.85469,
+            "4": 10.86843,
+            "5": 10.85304,
+            "6": 10.85362,
+            "7": 10.8602,
+            "8": 10.85298,
+            "9": 10.84874,
+            "10": 10.84674,
+            "11": 10.83863,
+            "12": 10.83549,
+            "13": 10.82524,
+            "14": 10.84078,
+            "15": 10.78613,
+            "16": 10.79372,
+            "17": 10.76553,
+            "18": 10.78902,
+            "19": 10.73057,
+            "20": 10.69489,
+            "21": 10.64595,
+            "22": 10.64791,
+            "23": 10.65524,
+            "24": 10.55349,
+            "25": 10.56424,
+            "26": 10.63262,
+            "27": 10.47084,
+            "28": 10.471,
+            "29": 10.36495,
+            "30": 10.27406,
+            "31": 10.43126,
+            "32": 10.35361,
+            "33": 10.22439,
+            "34": 10.17135,
+            "35": 10.23744,
+            "36": 10.15766,
+            "37": 10.10704,
+            "38": 10.03631,
+            "39": 10.04895,
+            "40": 10.06978,
+            "41": 9.95276,
+            "42": 9.95577,
+            "43": 9.87217,
+            "44": 9.99154,
+            "45": 10.00766,
+            "46": 9.84803,
+            "47": 10.00018,
+            "48": 9.81816,
+            "49": 9.94941,
+            "50": 9.94449,
+            "51": 9.5964,
+            "52": 9.79483,
+            "53": 9.63207,
+            "54": 9.8854,
+            "55": 9.74063,
+            "56": 9.85006,
+            "57": 9.86123,
+            "58": 9.87737,
+            "59": 9.54716,
+            "60": 9.64756,
+            "61": 9.87994,
+            "62": 9.76465,
+            "63": 9.68066,
+            "64": 9.82801,
+            "65": 9.59733,
+            "66": 9.62928,
+            "67": 9.74212,
+            "68": 9.60593,
+            "69": 9.29694,
+            "70": 9.42495,
+            "71": 9.79013,
+            "72": 9.71358,
+            "73": 9.61909,
+            "74": 9.45334,
+            "75": 9.24289,
+            "76": 9.50821,
+            "77": 9.57857,
+            "78": 9.56035,
+            "79": 9.31048,
+            "80": 9.36161,
+            "81": 9.46136,
+            "82": 9.55628,
+            "83": 9.53353,
+            "84": 9.35526,
+            "85": 9.40111,
+            "86": 9.65137,
+            "87": 9.23621,
+            "88": 9.48942,
+            "89": 9.22457,
+            "90": 9.41443,
+            "91": 9.39014,
+            "92": 9.3793,
+            "93": 9.36366,
+            "94": 9.51552,
+            "95": 9.42012,
+            "96": 9.33698,
+            "97": 9.20729,
+            "98": 9.49265,
+            "99": 9.29333,
+            "100": 9.35883,
+            "101": 9.24766,
+            "102": 9.24259,
+            "103": 9.07796,
+            "104": 9.16832,
+            "105": 9.37671,
+            "106": 9.15179,
+            "107": 9.17832,
+            "108": 9.31483,
+            "109": 9.28984,
+            "110": 9.36705,
+            "111": 9.17605,
+            "112": 9.23281,
+            "113": 9.35413,
+            "114": 9.35742,
+            "115": 9.32337,
+            "116": 9.00364,
+            "117": 9.06445,
+            "118": 9.06523,
+            "119": 9.22504,
+            "120": 9.08324,
+            "121": 9.19428,
+            "122": 9.14006,
+            "123": 9.25894,
+            "124": 9.45689,
+            "125": 9.21857,
+            "126": 9.0614,
+            "127": 9.01413,
+            "128": 9.22025,
+            "129": 8.98394,
+            "130": 9.14098,
+            "131": 9.15643,
+            "132": 9.03479,
+            "133": 8.86261,
+            "134": 9.18468,
+            "135": 8.88922,
+            "136": 9.1645,
+            "137": 9.15944,
+            "138": 9.23186,
+            "139": 9.08834,
+            "140": 8.87267,
+            "141": 9.29752,
+            "142": 9.19877,
+            "143": 9.12079,
+            "144": 9.24324,
+            "145": 9.10527,
+            "146": 8.98338,
+            "147": 8.9881,
+            "148": 9.1361,
+            "149": 9.06877,
+            "150": 9.01122,
+            "151": 8.93192,
+            "152": 8.87852,
+            "153": 9.06711,
+            "154": 9.1802,
+            "155": 9.13786,
+            "156": 9.05095,
+            "157": 9.15163,
+            "158": 9.05301,
+            "159": 9.03638,
+            "160": 8.89244,
+            "161": 9.04764,
+            "162": 8.89639,
+            "163": 8.84472,
+            "164": 8.97496,
+            "165": 8.93105,
+            "166": 8.65677,
+            "167": 8.83411,
+            "168": 8.8203,
+            "169": 8.65961,
+            "170": 9.04726,
+            "171": 8.72167,
+            "172": 8.82105,
+            "173": 8.91105,
+            "174": 8.85007,
+            "175": 8.70985,
+            "176": 8.7611,
+            "177": 8.76567,
+            "178": 8.72394,
+            "179": 8.64132,
+            "180": 8.74357,
+            "181": 8.6941,
+            "182": 8.72315,
+            "183": 9.08667,
+            "184": 8.60959,
+            "185": 8.88334,
+            "186": 8.74346,
+            "187": 8.57546,
+            "188": 8.6841,
+            "189": 8.86656,
+            "190": 8.53754,
+            "191": 8.66593,
+            "192": 8.61152,
+            "193": 8.5763,
+            "194": 8.75183,
+            "195": 8.5938,
+            "196": 8.7761,
+            "197": 8.744,
+            "198": 8.63042,
+            "199": 8.77202,
+            "200": 8.73627,
+            "201": 8.67068,
+            "202": 8.55099,
+            "203": 8.54134,
+            "204": 8.71213,
+            "205": 8.22486,
+            "206": 8.85986,
+            "207": 8.67928,
+            "208": 8.70826,
+            "209": 8.75243,
+            "210": 8.58226,
+            "211": 8.84167,
+            "212": 8.4913,
+            "213": 8.57316,
+            "214": 8.51316,
+            "215": 8.56549,
+            "216": 8.50617,
+            "217": 8.53369,
+            "218": 8.53635,
+            "219": 8.64298,
+            "220": 8.54526,
+            "221": 8.39761,
+            "222": 8.50474,
+            "223": 8.44078,
+            "224": 8.52901,
+            "225": 8.5708,
+            "226": 8.44247,
+            "227": 8.67823,
+            "228": 8.3859,
+            "229": 8.4537,
+            "230": 8.4985,
+            "231": 8.50257,
+            "232": 8.49898,
+            "233": 8.49438,
+            "234": 8.64018,
+            "235": 8.5617,
+            "236": 8.39791,
+            "237": 8.49075,
+            "238": 8.30637,
+            "239": 8.56099,
+            "240": 8.67125,
+            "241": 8.447,
+            "242": 8.47179,
+            "243": 8.51685,
+            "244": 8.36975,
+            "245": 8.59641,
+            "246": 8.59557,
+            "247": 8.43962,
+            "248": 8.50986,
+            "249": 8.52277,
+            "250": 8.42301,
+            "251": 8.3783,
+            "252": 8.54698,
+            "253": 8.3164,
+            "254": 8.35246,
+            "255": 8.29609,
+            "256": 8.20858,
+            "257": 8.39462,
+            "258": 8.45148,
+            "259": 8.23213,
+            "260": 8.24039,
+            "261": 8.23733,
+            "262": 8.34866,
+            "263": 8.30632,
+            "264": 8.1907,
+            "265": 8.33202,
+            "266": 8.2336,
+            "267": 7.9013,
+            "268": 8.37861,
+            "269": 8.40384,
+            "270": 8.26475,
+            "271": 8.27885,
+            "272": 8.31844,
+            "273": 8.13253,
+            "274": 8.09818,
+            "275": 8.00901,
+            "276": 7.92522,
+            "277": 8.23699,
+            "278": 8.04701,
+            "279": 7.96356,
+            "280": 7.75515,
+            "281": 8.10016,
+            "282": 8.14722,
+            "283": 8.15666,
+            "284": 8.10022,
+            "285": 8.06894,
+            "286": 7.90037,
+            "287": 7.99127,
+            "288": 8.24359,
+            "289": 8.17176,
+            "290": 8.12684,
+            "291": 8.25357,
+            "292": 8.0756,
+            "293": 8.11914,
+            "294": 7.97501,
+            "295": 7.96533,
+            "296": 8.23576,
+            "297": 7.79081,
+            "298": 8.04236,
+            "299": 7.93831,
+            "300": 7.8498,
+            "301": 8.00964,
+            "302": 7.94515,
+            "303": 7.99053,
+            "304": 7.95899,
+            "305": 7.9946,
+            "306": 7.9738,
+            "307": 7.98707,
+            "308": 7.9953,
+            "309": 8.0059,
+            "310": 7.97168,
+            "311": 7.92562,
+            "312": 7.88182,
+            "313": 7.82955,
+            "314": 7.82035,
+            "315": 7.82475,
+            "316": 7.74495,
+            "317": 7.92567,
+            "318": 7.97631,
+            "319": 7.82443,
+            "320": 7.563,
+            "321": 7.74534,
+            "322": 7.82917,
+            "323": 7.76703,
+            "324": 7.90668,
+            "325": 7.79387,
+            "326": 7.64901,
+            "327": 7.86137,
+            "328": 7.7832,
+            "329": 7.87669,
+            "330": 7.74815,
+            "331": 7.52005,
+            "332": 7.81037,
+            "333": 7.8379,
+            "334": 7.67759,
+            "335": 7.69435,
+            "336": 7.90998,
+            "337": 7.64618,
+            "338": 7.89178,
+            "339": 7.7192,
+            "340": 7.75318,
+            "341": 7.70375,
+            "342": 7.81451,
+            "343": 7.61028,
+            "344": 7.58433,
+            "345": 7.60474,
+            "346": 7.45825,
+            "347": 7.55021,
+            "348": 7.67669,
+            "349": 7.57925,
+            "350": 7.65118,
+            "351": 7.74172,
+            "352": 7.69877,
+            "353": 7.4955,
+            "354": 7.73645,
+            "355": 7.75823,
+            "356": 7.76871,
+            "357": 7.8083,
+            "358": 7.59223,
+            "359": 7.54129,
+            "360": 7.62161,
+            "361": 7.53913,
+            "362": 7.75707,
+            "363": 7.58184,
+            "364": 7.57393,
+            "365": 7.61381,
+            "366": 7.30007,
+            "367": 7.55433,
+            "368": 7.4381,
+            "369": 7.34072,
+            "370": 7.45786,
+            "371": 7.45479,
+            "372": 7.64528,
+            "373": 7.51803,
+            "374": 7.43579,
+            "375": 7.52279,
+            "376": 7.33856,
+            "377": 7.23275,
+            "378": 7.53208,
+            "379": 7.48549,
+            "380": 7.37893,
+            "381": 7.46259,
+            "382": 7.28593,
+            "383": 7.26774,
+            "384": 7.4035,
+            "385": 7.38617,
+            "386": 7.2246,
+            "387": 7.41197,
+            "388": 7.27354,
+            "389": 7.42884,
+            "390": 7.23295,
+            "391": 7.63854,
+            "392": 7.32743,
+            "393": 7.41119,
+            "394": 7.46811,
+            "395": 7.43164,
+            "396": 7.27624,
+            "397": 7.22237,
+            "398": 7.41314,
+            "399": 7.14965,
+            "400": 7.28882,
+            "401": 7.34645,
+            "402": 7.38389,
+            "403": 7.27445,
+            "404": 7.29549,
+            "405": 7.25441,
+            "406": 7.20955,
+            "407": 7.35305,
+            "408": 7.17476,
+            "409": 7.15738,
+            "410": 7.30843,
+            "411": 7.21046,
+            "412": 7.19143,
+            "413": 7.22421,
+            "414": 6.90584,
+            "415": 7.32329,
+            "416": 7.41955,
+            "417": 7.01436,
+            "418": 7.26656,
+            "419": 7.03251,
+            "420": 7.40294,
+            "421": 7.17304,
+            "422": 7.22884,
+            "423": 7.08611,
+            "424": 7.2354,
+            "425": 7.3087,
+            "426": 7.28003,
+            "427": 7.12262,
+            "428": 7.08425,
+            "429": 6.87125,
+            "430": 7.19779,
+            "431": 6.99763,
+            "432": 7.22298,
+            "433": 6.96906,
+            "434": 6.95232,
+            "435": 7.01097,
+            "436": 7.00141,
+            "437": 6.9848,
+            "438": 6.99447,
+            "439": 6.93128,
+            "440": 7.05472,
+            "441": 7.03406,
+            "442": 7.09324,
+            "443": 7.0854,
+            "444": 6.69941,
+            "445": 6.98741,
+            "446": 7.13474,
+            "447": 7.11726,
+            "448": 6.97509,
+            "449": 7.04203,
+            "450": 7.00855,
+            "451": 6.82317,
+            "452": 6.90281,
+            "453": 7.00796,
+            "454": 6.96028,
+            "455": 7.02393,
+            "456": 6.98781,
+            "457": 6.96156,
+            "458": 6.89735,
+            "459": 6.68323,
+            "460": 7.05439,
+            "461": 7.088,
+            "462": 6.86315,
+            "463": 7.04576,
+            "464": 6.64275,
+            "465": 7.02272,
+            "466": 6.99895,
+            "467": 6.99097,
+            "468": 6.94728,
+            "469": 6.82004,
+            "470": 7.0355,
+            "471": 6.87321,
+            "472": 6.95214,
+            "473": 6.81396,
+            "474": 6.96547,
+            "475": 7.1584,
+            "476": 6.75391,
+            "477": 6.88861,
+            "478": 6.89832,
+            "479": 6.69636,
+            "480": 7.01803,
+            "481": 6.98503,
+            "482": 6.72248,
+            "483": 6.77484,
+            "484": 6.74297,
+            "485": 6.92045,
+            "486": 7.05544,
+            "487": 6.62222,
+            "488": 6.87375,
+            "489": 6.76024,
+            "490": 6.81377,
+            "491": 6.69837,
+            "492": 6.68149,
+            "493": 6.75646,
+            "494": 6.66282,
+            "495": 6.62263,
+            "496": 6.57706,
+            "497": 6.8292,
+            "498": 6.63548,
+            "499": 6.84385,
+            "500": 6.64283,
+            "501": 6.71966,
+            "502": 6.82988,
+            "503": 6.69833,
+            "504": 6.60751,
+            "505": 6.6112,
+            "506": 6.73586,
+            "507": 6.85391,
+            "508": 6.84629,
+            "509": 6.6384,
+            "510": 6.81034,
+            "511": 6.72977,
+            "512": 6.72804,
+            "513": 6.64821,
+            "514": 6.70064,
+            "515": 6.43824,
+            "516": 6.73421,
+            "517": 6.69542,
+            "518": 6.52993,
+            "519": 6.62474,
+            "520": 6.84935,
+            "521": 6.65329,
+            "522": 6.6979,
+            "523": 6.73262,
+            "524": 6.72634,
+            "525": 6.6655,
+            "526": 6.40663,
+            "527": 6.79088,
+            "528": 6.65206,
+            "529": 6.62295,
+            "530": 6.61639,
+            "531": 6.63503,
+            "532": 6.62382,
+            "533": 6.75435,
+            "534": 6.60296,
+            "535": 6.74138,
+            "536": 6.61812,
+            "537": 6.63086,
+            "538": 6.52418,
+            "539": 6.54299,
+            "540": 6.57593,
+            "541": 6.44382,
+            "542": 6.66189,
+            "543": 6.67325,
+            "544": 6.66927,
+            "545": 6.80511,
+            "546": 6.6246,
+            "547": 6.40979,
+            "548": 6.71663,
+            "549": 6.68986,
+            "550": 6.51987,
+            "551": 6.74092,
+            "552": 6.63227,
+            "553": 6.47534,
+            "554": 6.62778,
+            "555": 6.45222,
+            "556": 6.60749,
+            "557": 6.62431,
+            "558": 6.37676,
+            "559": 6.36118,
+            "560": 6.5756,
+            "561": 6.72381,
+            "562": 6.62768,
+            "563": 6.73287,
+            "564": 6.34176,
+            "565": 6.50706,
+            "566": 6.6902,
+            "567": 6.55838,
+            "568": 6.50084,
+            "569": 6.44415,
+            "570": 6.35619,
+            "571": 6.62259,
+            "572": 6.30471,
+            "573": 6.5721,
+            "574": 6.46259,
+            "575": 6.63541,
+            "576": 6.50701,
+            "577": 6.51656,
+            "578": 6.47574,
+            "579": 6.45618,
+            "580": 6.5583,
+            "581": 6.59714,
+            "582": 6.46959,
+            "583": 6.50413,
+            "584": 6.51087,
+            "585": 6.41424,
+            "586": 6.40258,
+            "587": 6.4501,
+            "588": 6.55622,
+            "589": 6.61456,
+            "590": 6.27891,
+            "591": 6.66415,
+            "592": 6.2545,
+            "593": 6.46521,
+            "594": 6.37467,
+            "595": 6.34819,
+            "596": 6.25003,
+            "597": 6.18054,
+            "598": 6.44279,
+            "599": 6.38602,
+            "600": 6.44414,
+            "601": 6.25051,
+            "602": 6.51804,
+            "603": 6.50819,
+            "604": 6.37382,
+            "605": 6.48026,
+            "606": 6.3013,
+            "607": 6.51999,
+            "608": 6.66049,
+            "609": 6.16075,
+            "610": 6.55805,
+            "611": 6.38737,
+            "612": 6.56702,
+            "613": 6.41056,
+            "614": 6.18827,
+            "615": 6.38286,
+            "616": 6.34421,
+            "617": 6.36273,
+            "618": 6.43626,
+            "619": 6.12502,
+            "620": 6.3943,
+            "621": 6.44427,
+            "622": 6.38402,
+            "623": 6.56769,
+            "624": 6.34417,
+            "625": 6.26521,
+            "626": 6.28634,
+            "627": 6.4276,
+            "628": 6.24043,
+            "629": 6.57298,
+            "630": 6.3523,
+            "631": 6.33431,
+            "632": 6.29554,
+            "633": 6.24213,
+            "634": 6.29476,
+            "635": 6.53142,
+            "636": 6.23005,
+            "637": 6.62121,
+            "638": 6.00686,
+            "639": 6.26506,
+            "640": 6.2796,
+            "641": 6.19435,
+            "642": 6.27007,
+            "643": 6.44413,
+            "644": 6.2445,
+            "645": 6.23092,
+            "646": 6.38932,
+            "647": 6.3209,
+            "648": 6.34188,
+            "649": 6.33297,
+            "650": 6.47025,
+            "651": 6.31782,
+            "652": 6.23993,
+            "653": 6.36817,
+            "654": 6.43495,
+            "655": 6.5135,
+            "656": 6.31371,
+            "657": 6.4163,
+            "658": 6.22993,
+            "659": 6.1432,
+            "660": 6.3808,
+            "661": 6.15725,
+            "662": 6.2613,
+            "663": 6.36151,
+            "664": 6.32043,
+            "665": 6.39194,
+            "666": 6.15182,
+            "667": 6.18562,
+            "668": 6.22741,
+            "669": 6.20408,
+            "670": 6.23602,
+            "671": 6.22904,
+            "672": 6.47492,
+            "673": 6.32812,
+            "674": 6.28343,
+            "675": 6.37362,
+            "676": 6.38018,
+            "677": 6.29511,
+            "678": 6.26804,
+            "679": 6.22803,
+            "680": 6.28357,
+            "681": 6.19077,
+            "682": 6.07906,
+            "683": 6.26403,
+            "684": 6.31575,
+            "685": 6.2874,
+            "686": 6.14011,
+            "687": 6.27685,
+            "688": 6.19835,
+            "689": 6.61075,
+            "690": 6.16856,
+            "691": 6.17286,
+            "692": 6.2649,
+            "693": 6.13689,
+            "694": 6.22553,
+            "695": 6.31786,
+            "696": 6.1061,
+            "697": 6.14556,
+            "698": 6.21959,
+            "699": 6.45326,
+            "700": 6.03519,
+            "701": 6.05302,
+            "702": 6.23703,
+            "703": 6.17441,
+            "704": 6.20621,
+            "705": 6.11844,
+            "706": 6.06567,
+            "707": 6.24456,
+            "708": 6.30245,
+            "709": 5.99551,
+            "710": 6.15229,
+            "711": 6.2479,
+            "712": 6.17146,
+            "713": 5.88608,
+            "714": 6.09975,
+            "715": 6.10497,
+            "716": 6.40586,
+            "717": 6.18363,
+            "718": 6.23537,
+            "719": 6.26862,
+            "720": 6.25804,
+            "721": 6.25605,
+            "722": 6.22472,
+            "723": 6.07187,
+            "724": 6.22017,
+            "725": 6.0314,
+            "726": 6.29244,
+            "727": 6.00644,
+            "728": 6.03616,
+            "729": 6.0826,
+            "730": 6.17412,
+            "731": 6.09163,
+            "732": 6.07888,
+            "733": 6.11348,
+            "734": 6.37763,
+            "735": 6.26791,
+            "736": 6.17709,
+            "737": 6.36077,
+            "738": 6.13247,
+            "739": 6.14636,
+            "740": 5.87836,
+            "741": 6.00499,
+            "742": 5.98594,
+            "743": 6.17515,
+            "744": 6.02317,
+            "745": 6.14565,
+            "746": 6.03122,
+            "747": 6.09452,
+            "748": 6.22864,
+            "749": 5.93308,
+            "750": 6.16381,
+            "751": 5.95292,
+            "752": 6.01389,
+            "753": 6.02392,
+            "754": 6.28379,
+            "755": 6.12598,
+            "756": 6.2443,
+            "757": 6.01404,
+            "758": 6.19738,
+            "759": 6.22084,
+            "760": 6.02115,
+            "761": 6.1856,
+            "762": 6.21798,
+            "763": 6.02971,
+            "764": 5.95856,
+            "765": 5.92315,
+            "766": 5.96127,
+            "767": 5.81063,
+            "768": 6.18012,
+            "769": 6.27004,
+            "770": 6.28915,
+            "771": 5.78425,
+            "772": 6.0231,
+            "773": 6.17908,
+            "774": 5.87868,
+            "775": 6.02111,
+            "776": 6.12258,
+            "777": 5.875,
+            "778": 6.04901,
+            "779": 5.86583,
+            "780": 6.13275,
+            "781": 5.8451,
+            "782": 6.03644,
+            "783": 5.94982,
+            "784": 5.91239,
+            "785": 6.08718,
+            "786": 6.0949,
+            "787": 5.6498,
+            "788": 5.99117,
+            "789": 6.20208,
+            "790": 6.25533,
+            "791": 5.78584,
+            "792": 5.98398,
+            "793": 6.17232,
+            "794": 6.02303,
+            "795": 5.99758,
+            "796": 6.15575,
+            "797": 6.04799,
+            "798": 6.04773,
+            "799": 6.10394,
+            "800": 6.00523,
+            "801": 6.13976,
+            "802": 5.97143,
+            "803": 6.14303,
+            "804": 5.99897,
+            "805": 5.8162,
+            "806": 6.08016,
+            "807": 6.03933,
+            "808": 5.91779,
+            "809": 5.76774,
+            "810": 6.00748,
+            "811": 5.92407,
+            "812": 5.89853,
+            "813": 5.95603,
+            "814": 6.0199,
+            "815": 5.80113,
+            "816": 6.10732,
+            "817": 5.92704,
+            "818": 6.05349,
+            "819": 5.99954,
+            "820": 5.71925,
+            "821": 5.93871,
+            "822": 6.18742,
+            "823": 5.82051,
+            "824": 5.97479,
+            "825": 6.17898,
+            "826": 6.18992,
+            "827": 6.04811,
+            "828": 6.0618,
+            "829": 5.8808,
+            "830": 5.9338,
+            "831": 5.89066,
+            "832": 5.95946,
+            "833": 6.05775,
+            "834": 5.98694,
+            "835": 5.99225,
+            "836": 5.78808,
+            "837": 6.1001,
+            "838": 5.85774,
+            "839": 5.82603,
+            "840": 6.17451,
+            "841": 5.77389,
+            "842": 5.88244,
+            "843": 5.93827,
+            "844": 6.0037,
+            "845": 6.08214,
+            "846": 5.68388,
+            "847": 5.75348,
+            "848": 5.96075,
+            "849": 6.0909,
+            "850": 5.83839,
+            "851": 6.01221,
+            "852": 5.74277,
+            "853": 5.9819,
+            "854": 6.00994,
+            "855": 5.81104,
+            "856": 5.99027,
+            "857": 5.99462,
+            "858": 6.04349,
+            "859": 5.94378,
+            "860": 6.08776,
+            "861": 6.05806,
+            "862": 5.99259,
+            "863": 5.83184,
+            "864": 5.83727,
+            "865": 5.93014,
+            "866": 5.88373,
+            "867": 5.87071,
+            "868": 6.0603,
+            "869": 6.08011,
+            "870": 5.96321,
+            "871": 6.03762,
+            "872": 5.89053,
+            "873": 5.83933,
+            "874": 6.02181,
+            "875": 5.90658,
+            "876": 5.96303,
+            "877": 5.92074,
+            "878": 6.09702,
+            "879": 5.76213,
+            "880": 6.0073,
+            "881": 5.98795,
+            "882": 5.90217,
+            "883": 5.67039,
+            "884": 5.95748,
+            "885": 5.74054,
+            "886": 5.98445,
+            "887": 5.90648,
+            "888": 5.8314,
+            "889": 6.00733,
+            "890": 6.01123,
+            "891": 5.94286,
+            "892": 5.70277,
+            "893": 6.08459,
+            "894": 5.72165,
+            "895": 5.83588,
+            "896": 5.83978,
+            "897": 5.84943,
+            "898": 5.92347,
+            "899": 5.93201,
+            "900": 5.8958,
+            "901": 5.94689,
+            "902": 5.82987,
+            "903": 6.04738,
+            "904": 5.92586,
+            "905": 5.89894,
+            "906": 5.61575,
+            "907": 5.90522,
+            "908": 5.73333,
+            "909": 5.98526,
+            "910": 5.85686,
+            "911": 5.69844,
+            "912": 5.69856,
+            "913": 5.76407,
+            "914": 5.82436,
+            "915": 5.79681,
+            "916": 5.88608,
+            "917": 5.867,
+            "918": 5.8166,
+            "919": 5.80848,
+            "920": 5.88971,
+            "921": 5.8407,
+            "922": 5.62064,
+            "923": 6.03383,
+            "924": 5.60482,
+            "925": 5.61823,
+            "926": 5.85786,
+            "927": 5.95554,
+            "928": 5.83872,
+            "929": 5.82237,
+            "930": 5.95411,
+            "931": 5.75622,
+            "932": 5.59098,
+            "933": 5.63134,
+            "934": 5.80496,
+            "935": 5.63538,
+            "936": 5.8317,
+            "937": 5.96485,
+            "938": 5.58943,
+            "939": 5.79158,
+            "940": 5.96089,
+            "941": 5.72676,
+            "942": 5.83595,
+            "943": 5.87091,
+            "944": 5.95881,
+            "945": 5.70173,
+            "946": 5.55832,
+            "947": 5.74676,
+            "948": 5.79172,
+            "949": 5.82702,
+            "950": 5.84636,
+            "951": 5.72232,
+            "952": 5.6926,
+            "953": 5.67846,
+            "954": 5.72814,
+            "955": 5.52701,
+            "956": 5.6247,
+            "957": 5.84082,
+            "958": 5.79725,
+            "959": 5.57236,
+            "960": 5.8033,
+            "961": 5.83318,
+            "962": 5.76931,
+            "963": 5.768,
+            "964": 5.70825,
+            "965": 5.63755,
+            "966": 5.60344,
+            "967": 5.72795,
+            "968": 5.74037,
+            "969": 5.82565,
+            "970": 5.64868,
+            "971": 5.70857,
+            "972": 5.85255,
+            "973": 5.67308,
+            "974": 5.7177,
+            "975": 5.86027,
+            "976": 5.71074,
+            "977": 5.77363,
+            "978": 5.68598,
+            "979": 5.5901,
+            "980": 5.76431,
+            "981": 5.89808,
+            "982": 5.47164,
+            "983": 5.61909,
+            "984": 5.54693,
+            "985": 5.58914,
+            "986": 5.6395,
+            "987": 5.57215,
+            "988": 5.71212,
+            "989": 5.69568,
+            "990": 5.62713,
+            "991": 5.85071,
+            "992": 5.77178,
+            "993": 5.87182,
+            "994": 5.69827,
+            "995": 5.7311,
+            "996": 5.73947,
+            "997": 5.81776,
+            "998": 5.83946,
+            "999": 5.83213,
+            "1000": 5.68618,
+            "1001": 5.86902,
+            "1002": 5.75759,
+            "1003": 5.64206,
+            "1004": 5.80056,
+            "1005": 5.53357,
+            "1006": 5.3287,
+            "1007": 5.7697,
+            "1008": 5.79391,
+            "1009": 5.65438,
+            "1010": 5.78459,
+            "1011": 5.89696,
+            "1012": 5.62269,
+            "1013": 5.61367,
+            "1014": 5.67992,
+            "1015": 5.56146,
+            "1016": 5.87263,
+            "1017": 5.83169,
+            "1018": 5.62357,
+            "1019": 5.73336,
+            "1020": 5.61404,
+            "1021": 5.85353,
+            "1022": 5.49696,
+            "1023": 5.65062,
+            "1024": 5.74334,
+            "1025": 5.57222,
+            "1026": 5.40994,
+            "1027": 5.59905,
+            "1028": 5.68935,
+            "1029": 5.68346,
+            "1030": 5.68799,
+            "1031": 5.40526,
+            "1032": 5.78443,
+            "1033": 5.57561,
+            "1034": 5.6274,
+            "1035": 5.71529,
+            "1036": 5.62368,
+            "1037": 5.36621,
+            "1038": 5.66561,
+            "1039": 5.6477,
+            "1040": 5.57324,
+            "1041": 5.59731,
+            "1042": 5.81493,
+            "1043": 5.56271,
+            "1044": 5.46406,
+            "1045": 5.9683,
+            "1046": 5.48617,
+            "1047": 5.39181,
+            "1048": 5.49562,
+            "1049": 5.67791,
+            "1050": 5.69881,
+            "1051": 5.5776,
+            "1052": 5.68149,
+            "1053": 5.63114,
+            "1054": 5.45857,
+            "1055": 5.59887,
+            "1056": 5.67508,
+            "1057": 5.75628,
+            "1058": 5.56524,
+            "1059": 5.74843,
+            "1060": 5.82162,
+            "1061": 5.47233,
+            "1062": 5.65043,
+            "1063": 5.50248,
+            "1064": 5.59125,
+            "1065": 5.55564,
+            "1066": 5.74466,
+            "1067": 5.67043,
+            "1068": 5.44061,
+            "1069": 5.61122,
+            "1070": 5.81207,
+            "1071": 5.51069,
+            "1072": 5.62291,
+            "1073": 5.6192,
+            "1074": 5.52379,
+            "1075": 5.70748,
+            "1076": 5.5951,
+            "1077": 5.70681,
+            "1078": 5.56223,
+            "1079": 5.61677,
+            "1080": 5.64259,
+            "1081": 5.62201,
+            "1082": 5.50149,
+            "1083": 5.64213,
+            "1084": 5.55087,
+            "1085": 5.40393,
+            "1086": 5.62042,
+            "1087": 5.44171,
+            "1088": 5.51111,
+            "1089": 5.76887,
+            "1090": 5.52736,
+            "1091": 5.51307,
+            "1092": 5.40781,
+            "1093": 5.69672,
+            "1094": 5.56925,
+            "1095": 5.5731,
+            "1096": 5.61367,
+            "1097": 5.6454,
+            "1098": 5.65292,
+            "1099": 5.51436,
+            "1100": 5.63973,
+            "1101": 5.67989,
+            "1102": 5.53567,
+            "1103": 5.54943,
+            "1104": 5.53818,
+            "1105": 5.55271,
+            "1106": 5.68243,
+            "1107": 5.68309,
+            "1108": 5.78112,
+            "1109": 5.54014,
+            "1110": 5.6617,
+            "1111": 5.59215,
+            "1112": 5.58702,
+            "1113": 5.62687,
+            "1114": 5.61504,
+            "1115": 5.59863,
+            "1116": 5.66461,
+            "1117": 5.64732,
+            "1118": 5.65418,
+            "1119": 5.70846,
+            "1120": 5.63501,
+            "1121": 5.37809,
+            "1122": 5.23308,
+            "1123": 5.47298,
+            "1124": 5.65454,
+            "1125": 5.68419,
+            "1126": 5.68674,
+            "1127": 5.56954,
+            "1128": 5.62438,
+            "1129": 5.29406,
+            "1130": 5.54548,
+            "1131": 5.6238,
+            "1132": 5.72077,
+            "1133": 5.51615,
+            "1134": 5.55302,
+            "1135": 5.51992,
+            "1136": 5.42021,
+            "1137": 5.46757,
+            "1138": 5.5657,
+            "1139": 5.41524,
+            "1140": 5.26144,
+            "1141": 5.58424,
+            "1142": 5.64054,
+            "1143": 5.385,
+            "1144": 5.3823,
+            "1145": 5.36615,
+            "1146": 5.62886,
+            "1147": 5.49181,
+            "1148": 5.50478,
+            "1149": 5.51839,
+            "1150": 5.39997,
+            "1151": 5.5553,
+            "1152": 5.42174,
+            "1153": 5.4602,
+            "1154": 5.50372,
+            "1155": 5.44072,
+            "1156": 5.34868,
+            "1157": 5.66217,
+            "1158": 5.39889,
+            "1159": 5.33332,
+            "1160": 5.79511,
+            "1161": 5.53597,
+            "1162": 5.45589,
+            "1163": 5.52529,
+            "1164": 5.38319,
+            "1165": 5.52473,
+            "1166": 5.48721,
+            "1167": 5.36058,
+            "1168": 5.49334,
+            "1169": 5.40387,
+            "1170": 5.58667,
+            "1171": 5.48535,
+            "1172": 5.64049,
+            "1173": 5.62012,
+            "1174": 5.51308,
+            "1175": 5.34473,
+            "1176": 5.38256,
+            "1177": 5.55838,
+            "1178": 5.46714,
+            "1179": 5.49373,
+            "1180": 5.46571,
+            "1181": 5.55314,
+            "1182": 5.59825,
+            "1183": 5.76884,
+            "1184": 5.54748,
+            "1185": 5.28691,
+            "1186": 5.60427,
+            "1187": 5.55401,
+            "1188": 5.51546,
+            "1189": 5.38634,
+            "1190": 5.40233,
+            "1191": 5.38976,
+            "1192": 5.49689,
+            "1193": 5.46486,
+            "1194": 5.45443,
+            "1195": 5.32542,
+            "1196": 5.52268,
+            "1197": 5.47666,
+            "1198": 5.52589,
+            "1199": 5.38688,
+            "1200": 5.33164,
+            "1201": 5.49012,
+            "1202": 5.43748,
+            "1203": 5.49375,
+            "1204": 5.40666,
+            "1205": 5.48999,
+            "1206": 5.33478,
+            "1207": 5.58651,
+            "1208": 5.42414,
+            "1209": 5.2931,
+            "1210": 5.49969,
+            "1211": 5.5071,
+            "1212": 5.59732,
+            "1213": 5.41745,
+            "1214": 5.49785,
+            "1215": 5.23706,
+            "1216": 5.41194,
+            "1217": 5.38264,
+            "1218": 5.4506,
+            "1219": 5.48501,
+            "1220": 5.38351,
+            "1221": 5.4519,
+            "1222": 5.31254,
+            "1223": 5.47747,
+            "1224": 5.41418,
+            "1225": 5.42845,
+            "1226": 5.32249,
+            "1227": 5.47547,
+            "1228": 5.73249,
+            "1229": 5.32716,
+            "1230": 5.41211,
+            "1231": 5.07649,
+            "1232": 5.78792,
+            "1233": 5.28531,
+            "1234": 5.24399,
+            "1235": 5.36824,
+            "1236": 5.47881,
+            "1237": 5.20655,
+            "1238": 5.41404,
+            "1239": 5.40719,
+            "1240": 5.46621,
+            "1241": 5.57221,
+            "1242": 5.45465,
+            "1243": 5.43424,
+            "1244": 5.51633,
+            "1245": 5.19115,
+            "1246": 5.71566,
+            "1247": 5.43,
+            "1248": 5.29843,
+            "1249": 5.40246,
+            "1250": 5.34088,
+            "1251": 5.41904,
+            "1252": 5.57108,
+            "1253": 5.489,
+            "1254": 5.31099,
+            "1255": 5.51387,
+            "1256": 5.60708,
+            "1257": 5.42325,
+            "1258": 5.55956,
+            "1259": 5.47585,
+            "1260": 5.50779,
+            "1261": 5.63801,
+            "1262": 5.39496,
+            "1263": 5.32432,
+            "1264": 5.50348,
+            "1265": 5.30656,
+            "1266": 5.23675,
+            "1267": 5.37031,
+            "1268": 5.38615,
+            "1269": 5.14823,
+            "1270": 5.39882,
+            "1271": 5.27753,
+            "1272": 5.52297,
+            "1273": 5.29632,
+            "1274": 5.34638,
+            "1275": 5.37784,
+            "1276": 5.3975,
+            "1277": 5.4606,
+            "1278": 5.35501,
+            "1279": 5.43897,
+            "1280": 5.45708,
+            "1281": 5.4056,
+            "1282": 5.38482,
+            "1283": 5.42347,
+            "1284": 5.34377,
+            "1285": 5.50505,
+            "1286": 5.33544,
+            "1287": 5.58814,
+            "1288": 5.2615,
+            "1289": 5.42995,
+            "1290": 5.49991,
+            "1291": 5.49987,
+            "1292": 5.44631,
+            "1293": 5.4171,
+            "1294": 5.49492,
+            "1295": 5.34499,
+            "1296": 5.18358,
+            "1297": 5.16726,
+            "1298": 5.11761,
+            "1299": 5.30129,
+            "1300": 5.21142,
+            "1301": 5.30283,
+            "1302": 5.27612,
+            "1303": 5.35547,
+            "1304": 5.43158,
+            "1305": 5.36825,
+            "1306": 5.25293,
+            "1307": 5.19217,
+            "1308": 5.27071,
+            "1309": 5.40774,
+            "1310": 5.26053,
+            "1311": 5.37774,
+            "1312": 5.35324,
+            "1313": 5.29428,
+            "1314": 5.29224,
+            "1315": 5.41906,
+            "1316": 5.25856,
+            "1317": 5.27981,
+            "1318": 5.21136,
+            "1319": 5.34401,
+            "1320": 5.4177,
+            "1321": 5.44957,
+            "1322": 5.46219,
+            "1323": 5.37269,
+            "1324": 5.24973,
+            "1325": 5.40538,
+            "1326": 5.53891,
+            "1327": 5.38638,
+            "1328": 5.21164,
+            "1329": 5.41667,
+            "1330": 5.39695,
+            "1331": 5.30979,
+            "1332": 5.3112,
+            "1333": 5.36823,
+            "1334": 5.44451,
+            "1335": 5.36788,
+            "1336": 5.43552,
+            "1337": 5.46933,
+            "1338": 5.30246,
+            "1339": 5.1362,
+            "1340": 5.41205,
+            "1341": 5.34033,
+            "1342": 5.35625,
+            "1343": 5.47387,
+            "1344": 5.37842,
+            "1345": 5.34238,
+            "1346": 5.07927,
+            "1347": 5.38404,
+            "1348": 5.49312,
+            "1349": 5.40746,
+            "1350": 5.02698,
+            "1351": 5.31566,
+            "1352": 5.15947,
+            "1353": 5.3409,
+            "1354": 5.35878,
+            "1355": 5.11364,
+            "1356": 5.25842,
+            "1357": 5.28929,
+            "1358": 5.15831,
+            "1359": 5.10775,
+            "1360": 5.17385,
+            "1361": 5.30604,
+            "1362": 5.06672,
+            "1363": 5.29722,
+            "1364": 5.3953,
+            "1365": 5.01953,
+            "1366": 5.1147,
+            "1367": 5.33054,
+            "1368": 5.18248,
+            "1369": 5.22391,
+            "1370": 5.1961,
+            "1371": 5.27906,
+            "1372": 5.25988,
+            "1373": 5.28404,
+            "1374": 5.2779,
+            "1375": 5.46001,
+            "1376": 5.26713,
+            "1377": 5.26807,
+            "1378": 5.31427,
+            "1379": 5.22765,
+            "1380": 5.25807,
+            "1381": 5.47919,
+            "1382": 5.08739,
+            "1383": 5.37543,
+            "1384": 5.36108,
+            "1385": 5.39028,
+            "1386": 5.16582,
+            "1387": 5.16244,
+            "1388": 5.27616,
+            "1389": 5.30262,
+            "1390": 5.25131,
+            "1391": 5.26406,
+            "1392": 5.36794,
+            "1393": 5.37824,
+            "1394": 5.40104,
+            "1395": 5.32383,
+            "1396": 5.21137,
+            "1397": 5.2828,
+            "1398": 5.36587,
+            "1399": 5.35557,
+            "1400": 5.26522,
+            "1401": 5.35981,
+            "1402": 5.42507,
+            "1403": 5.19768,
+            "1404": 5.27957,
+            "1405": 5.11754,
+            "1406": 4.98933,
+            "1407": 5.39818,
+            "1408": 5.1921,
+            "1409": 5.39429,
+            "1410": 5.37153,
+            "1411": 4.91585,
+            "1412": 5.35244,
+            "1413": 5.41055,
+            "1414": 5.21699,
+            "1415": 5.44044,
+            "1416": 5.32598,
+            "1417": 5.39078,
+            "1418": 5.29894,
+            "1419": 5.31316,
+            "1420": 5.43638,
+            "1421": 5.39683,
+            "1422": 5.41859,
+            "1423": 4.99867,
+            "1424": 5.33177,
+            "1425": 5.58491,
+            "1426": 5.23068,
+            "1427": 5.31742,
+            "1428": 5.33463,
+            "1429": 5.07871,
+            "1430": 5.32748,
+            "1431": 5.32237,
+            "1432": 5.34216,
+            "1433": 5.18496,
+            "1434": 5.16175,
+            "1435": 5.20122,
+            "1436": 5.10715,
+            "1437": 5.22566,
+            "1438": 5.31423,
+            "1439": 5.34769,
+            "1440": 5.34295,
+            "1441": 5.16777,
+            "1442": 5.21935,
+            "1443": 5.20553,
+            "1444": 5.12984,
+            "1445": 5.07414,
+            "1446": 5.26456,
+            "1447": 5.25775,
+            "1448": 5.29302,
+            "1449": 5.24616,
+            "1450": 5.34316,
+            "1451": 5.07004,
+            "1452": 5.26796,
+            "1453": 5.1741,
+            "1454": 5.01458,
+            "1455": 5.12771,
+            "1456": 5.27213,
+            "1457": 5.1882,
+            "1458": 5.00695,
+            "1459": 5.2215,
+            "1460": 5.23955,
+            "1461": 5.08,
+            "1462": 4.97269,
+            "1463": 5.15114,
+            "1464": 5.22113,
+            "1465": 5.27344,
+            "1466": 5.36076,
+            "1467": 5.34631,
+            "1468": 5.2303,
+            "1469": 5.05117,
+            "1470": 5.12322,
+            "1471": 5.25302,
+            "1472": 5.12175,
+            "1473": 5.10167,
+            "1474": 5.21744,
+            "1475": 5.18613,
+            "1476": 5.15517,
+            "1477": 5.26215,
+            "1478": 5.30407,
+            "1479": 5.01063,
+            "1480": 5.182,
+            "1481": 5.25124,
+            "1482": 5.3494,
+            "1483": 5.27058,
+            "1484": 4.92644,
+            "1485": 5.29103,
+            "1486": 5.04435,
+            "1487": 4.88432,
+            "1488": 5.18325,
+            "1489": 5.10139,
+            "1490": 5.04545,
+            "1491": 5.3188,
+            "1492": 5.22283,
+            "1493": 4.94061,
+            "1494": 5.10891,
+            "1495": 5.13402,
+            "1496": 5.05779,
+            "1497": 5.36536,
+            "1498": 5.30609,
+            "1499": 5.143,
+            "1500": 5.09554,
+            "1501": 5.0349,
+            "1502": 5.15423,
+            "1503": 5.43131,
+            "1504": 5.32574,
+            "1505": 5.00836,
+            "1506": 5.14423,
+            "1507": 5.16501,
+            "1508": 5.16864,
+            "1509": 5.3204,
+            "1510": 5.02703,
+            "1511": 5.1198,
+            "1512": 4.98354,
+            "1513": 5.1699,
+            "1514": 5.33407,
+            "1515": 5.36306,
+            "1516": 5.27572,
+            "1517": 5.2256,
+            "1518": 5.02899,
+            "1519": 5.29833,
+            "1520": 5.13757,
+            "1521": 5.15715,
+            "1522": 5.33462,
+            "1523": 5.24144,
+            "1524": 5.06791,
+            "1525": 5.20708,
+            "1526": 5.27861,
+            "1527": 5.25864,
+            "1528": 5.2395,
+            "1529": 5.18253,
+            "1530": 5.23913,
+            "1531": 5.09996,
+            "1532": 5.15679,
+            "1533": 5.05231,
+            "1534": 5.21917,
+            "1535": 5.16769,
+            "1536": 5.102,
+            "1537": 5.0318,
+            "1538": 4.91991,
+            "1539": 5.2394,
+            "1540": 5.11391,
+            "1541": 5.25502,
+            "1542": 5.23775,
+            "1543": 5.05438,
+            "1544": 5.08156,
+            "1545": 5.11794,
+            "1546": 5.32713,
+            "1547": 5.10763,
+            "1548": 5.23418,
+            "1549": 5.23089,
+            "1550": 4.97536,
+            "1551": 5.25942,
+            "1552": 5.0226,
+            "1553": 5.14887,
+            "1554": 5.11051,
+            "1555": 5.11223,
+            "1556": 5.19882,
+            "1557": 5.08844,
+            "1558": 5.22982,
+            "1559": 5.00137,
+            "1560": 5.11269,
+            "1561": 5.14639,
+            "1562": 5.18443,
+            "1563": 5.24639,
+            "1564": 5.26429,
+            "1565": 5.08809,
+            "1566": 5.29393,
+            "1567": 5.04372,
+            "1568": 5.08304,
+            "1569": 5.2002,
+            "1570": 5.17168,
+            "1571": 4.95228,
+            "1572": 5.04524,
+            "1573": 5.02748,
+            "1574": 4.99831,
+            "1575": 5.23124,
+            "1576": 5.20891,
+            "1577": 5.12722,
+            "1578": 5.36355,
+            "1579": 4.94343,
+            "1580": 5.12556,
+            "1581": 5.09739,
+            "1582": 5.28014,
+            "1583": 5.04619,
+            "1584": 5.0566,
+            "1585": 5.11727,
+            "1586": 5.30646,
+            "1587": 5.13281,
+            "1588": 5.22351,
+            "1589": 4.83814,
+            "1590": 5.09825,
+            "1591": 5.18082,
+            "1592": 5.14078,
+            "1593": 5.23646,
+            "1594": 5.11532,
+            "1595": 5.10761,
+            "1596": 5.19194,
+            "1597": 5.11362,
+            "1598": 5.16252,
+            "1599": 5.18865,
+            "1600": 4.86676,
+            "1601": 5.11898,
+            "1602": 5.22827,
+            "1603": 5.19524,
+            "1604": 5.05797,
+            "1605": 5.03277,
+            "1606": 4.98991,
+            "1607": 5.06915,
+            "1608": 4.97927,
+            "1609": 5.07061,
+            "1610": 5.04561,
+            "1611": 4.9918,
+            "1612": 4.75806,
+            "1613": 5.03141,
+            "1614": 4.87811,
+            "1615": 5.07817,
+            "1616": 5.22549,
+            "1617": 5.06182,
+            "1618": 4.98945,
+            "1619": 5.18486,
+            "1620": 5.14429,
+            "1621": 5.31666,
+            "1622": 5.06737,
+            "1623": 5.15063,
+            "1624": 5.1305,
+            "1625": 5.12197,
+            "1626": 5.10206,
+            "1627": 5.1085,
+            "1628": 5.06234,
+            "1629": 4.93316,
+            "1630": 5.06616,
+            "1631": 5.05719,
+            "1632": 5.10145,
+            "1633": 4.97087,
+            "1634": 4.92194,
+            "1635": 5.05013,
+            "1636": 4.9202,
+            "1637": 5.22863,
+            "1638": 5.15783,
+            "1639": 4.9808,
+            "1640": 5.00716,
+            "1641": 5.12367,
+            "1642": 5.0869,
+            "1643": 5.05029,
+            "1644": 5.12283,
+            "1645": 4.96415,
+            "1646": 5.12257,
+            "1647": 5.03267,
+            "1648": 5.1903,
+            "1649": 4.92263,
+            "1650": 5.0596,
+            "1651": 4.93391,
+            "1652": 5.21143,
+            "1653": 5.1587,
+            "1654": 5.13384,
+            "1655": 5.16235,
+            "1656": 5.34793,
+            "1657": 5.21074,
+            "1658": 5.04155,
+            "1659": 4.92889,
+            "1660": 4.8117,
+            "1661": 5.02968,
+            "1662": 5.14515,
+            "1663": 5.15868,
+            "1664": 4.98471,
+            "1665": 5.11027,
+            "1666": 5.10315,
+            "1667": 4.84929,
+            "1668": 5.10956,
+            "1669": 5.07311,
+            "1670": 5.11152,
+            "1671": 5.16545,
+            "1672": 4.77709,
+            "1673": 5.03502,
+            "1674": 4.91572,
+            "1675": 5.04406,
+            "1676": 5.0023,
+            "1677": 4.80013,
+            "1678": 5.02745,
+            "1679": 4.88908,
+            "1680": 5.03791,
+            "1681": 5.06371,
+            "1682": 5.03586,
+            "1683": 4.90255,
+            "1684": 5.06133,
+            "1685": 5.13096,
+            "1686": 5.075,
+            "1687": 4.97679,
+            "1688": 5.17279,
+            "1689": 5.1507,
+            "1690": 4.99681,
+            "1691": 4.99961,
+            "1692": 4.91412,
+            "1693": 5.02305,
+            "1694": 4.94741,
+            "1695": 4.91895,
+            "1696": 5.0846,
+            "1697": 5.05067,
+            "1698": 4.95116,
+            "1699": 5.00638,
+            "1700": 4.94576,
+            "1701": 5.16681,
+            "1702": 5.07316,
+            "1703": 5.16582,
+            "1704": 5.14235,
+            "1705": 4.96408,
+            "1706": 4.98303,
+            "1707": 4.78833,
+            "1708": 5.03283,
+            "1709": 5.2281,
+            "1710": 5.02918,
+            "1711": 5.18873,
+            "1712": 5.19088,
+            "1713": 5.03631,
+            "1714": 5.04689,
+            "1715": 4.91662,
+            "1716": 4.93663,
+            "1717": 4.86445,
+            "1718": 5.02654,
+            "1719": 5.12575,
+            "1720": 5.02353,
+            "1721": 4.9343,
+            "1722": 5.06572,
+            "1723": 4.93302,
+            "1724": 5.03906,
+            "1725": 5.19169,
+            "1726": 5.06497,
+            "1727": 4.91076,
+            "1728": 5.01922,
+            "1729": 5.04885,
+            "1730": 4.91107,
+            "1731": 5.00108,
+            "1732": 4.91468,
+            "1733": 5.12873,
+            "1734": 4.83023,
+            "1735": 5.21293,
+            "1736": 4.91729,
+            "1737": 4.86164,
+            "1738": 4.97933,
+            "1739": 5.16149,
+            "1740": 4.84041,
+            "1741": 4.78298,
+            "1742": 4.91062,
+            "1743": 5.09353,
+            "1744": 4.98531,
+            "1745": 4.82544,
+            "1746": 4.94973,
+            "1747": 4.86843,
+            "1748": 5.06696,
+            "1749": 4.86793,
+            "1750": 5.01333,
+            "1751": 5.12023,
+            "1752": 4.90813,
+            "1753": 5.09204,
+            "1754": 5.05813,
+            "1755": 4.89777,
+            "1756": 5.02216,
+            "1757": 5.14157,
+            "1758": 4.87188,
+            "1759": 4.94434,
+            "1760": 4.83222,
+            "1761": 5.02427,
+            "1762": 4.81507,
+            "1763": 4.77391,
+            "1764": 4.93175,
+            "1765": 5.14727,
+            "1766": 5.33614,
+            "1767": 5.22331,
+            "1768": 4.94712,
+            "1769": 5.0043,
+            "1770": 4.98512,
+            "1771": 4.96473,
+            "1772": 4.98299,
+            "1773": 4.97266,
+            "1774": 4.87138,
+            "1775": 4.9493,
+            "1776": 4.9958,
+            "1777": 4.94665,
+            "1778": 4.99288,
+            "1779": 5.08212,
+            "1780": 4.83608,
+            "1781": 5.05478,
+            "1782": 4.99549,
+            "1783": 5.01236,
+            "1784": 4.93254,
+            "1785": 5.16842,
+            "1786": 4.80892,
+            "1787": 4.9699,
+            "1788": 4.82948,
+            "1789": 4.88554,
+            "1790": 4.80386,
+            "1791": 4.74542,
+            "1792": 4.87988,
+            "1793": 5.11081,
+            "1794": 4.98659,
+            "1795": 4.97147,
+            "1796": 5.00354,
+            "1797": 4.79101,
+            "1798": 4.77029,
+            "1799": 5.01913,
+            "1800": 4.91155,
+            "1801": 5.04891,
+            "1802": 4.82591,
+            "1803": 4.95313,
+            "1804": 4.88492,
+            "1805": 4.90634,
+            "1806": 4.88167,
+            "1807": 4.92894,
+            "1808": 4.92469,
+            "1809": 5.15028,
+            "1810": 5.09708,
+            "1811": 4.96325,
+            "1812": 4.8059,
+            "1813": 5.1023,
+            "1814": 4.7819,
+            "1815": 4.86518,
+            "1816": 5.05104,
+            "1817": 4.79238,
+            "1818": 4.80401,
+            "1819": 5.02672,
+            "1820": 4.68884,
+            "1821": 5.02319,
+            "1822": 4.66224,
+            "1823": 4.86936,
+            "1824": 4.7914,
+            "1825": 5.06607,
+            "1826": 4.81841,
+            "1827": 4.79544,
+            "1828": 4.9506,
+            "1829": 5.10848,
+            "1830": 4.9163,
+            "1831": 4.89965,
+            "1832": 4.83328,
+            "1833": 4.78854,
+            "1834": 4.94794,
+            "1835": 4.96175,
+            "1836": 4.91339,
+            "1837": 4.6762,
+            "1838": 4.80703,
+            "1839": 4.89949,
+            "1840": 4.91213,
+            "1841": 4.84083,
+            "1842": 4.9567,
+            "1843": 4.71182,
+            "1844": 4.6194,
+            "1845": 5.00584,
+            "1846": 4.75435,
+            "1847": 4.86491,
+            "1848": 4.9035,
+            "1849": 4.85124,
+            "1850": 4.87005,
+            "1851": 5.01617,
+            "1852": 4.97859,
+            "1853": 4.82821,
+            "1854": 4.86426,
+            "1855": 4.82455,
+            "1856": 4.75214,
+            "1857": 4.96641,
+            "1858": 4.96711,
+            "1859": 4.7484,
+            "1860": 4.86558,
+            "1861": 5.21257,
+            "1862": 4.61253,
+            "1863": 4.83567,
+            "1864": 4.74748,
+            "1865": 4.86472,
+            "1866": 4.78934,
+            "1867": 5.00307,
+            "1868": 4.72073,
+            "1869": 4.76301,
+            "1870": 4.93972,
+            "1871": 5.00163,
+            "1872": 4.68713,
+            "1873": 4.70038,
+            "1874": 4.85131,
+            "1875": 4.85367,
+            "1876": 4.74378,
+            "1877": 4.80696,
+            "1878": 4.8139,
+            "1879": 4.82462,
+            "1880": 4.89248,
+            "1881": 4.79379,
+            "1882": 4.79882,
+            "1883": 4.78556,
+            "1884": 4.97714,
+            "1885": 4.92363,
+            "1886": 4.82454,
+            "1887": 4.82091,
+            "1888": 4.97246,
+            "1889": 4.96553,
+            "1890": 4.71236,
+            "1891": 4.65764,
+            "1892": 4.85277,
+            "1893": 4.65022,
+            "1894": 4.90165,
+            "1895": 4.79,
+            "1896": 4.66068,
+            "1897": 4.79617,
+            "1898": 4.92161,
+            "1899": 4.77736,
+            "1900": 4.91325,
+            "1901": 4.84998,
+            "1902": 4.787,
+            "1903": 4.76372,
+            "1904": 4.65638,
+            "1905": 4.55077,
+            "1906": 4.81577,
+            "1907": 4.9106,
+            "1908": 5.03029,
+            "1909": 4.89294,
+            "1910": 4.7884,
+            "1911": 4.81269,
+            "1912": 4.653,
+            "1913": 4.95098,
+            "1914": 4.88806,
+            "1915": 4.86687,
+            "1916": 4.9302,
+            "1917": 4.85504,
+            "1918": 4.87427,
+            "1919": 4.99557,
+            "1920": 4.77001,
+            "1921": 4.88729,
+            "1922": 4.8196,
+            "1923": 4.75752,
+            "1924": 4.8297,
+            "1925": 5.05687,
+            "1926": 4.94229,
+            "1927": 4.93308,
+            "1928": 4.92739,
+            "1929": 4.93147,
+            "1930": 4.917,
+            "1931": 4.77692,
+            "1932": 4.86743,
+            "1933": 4.83532,
+            "1934": 4.84373,
+            "1935": 5.11279,
+            "1936": 4.88728,
+            "1937": 4.8824,
+            "1938": 4.80623,
+            "1939": 4.70831,
+            "1940": 4.83067,
+            "1941": 4.74224,
+            "1942": 4.87785,
+            "1943": 4.74082,
+            "1944": 4.7536,
+            "1945": 4.69017,
+            "1946": 4.91953,
+            "1947": 4.87613,
+            "1948": 4.60452,
+            "1949": 4.89888,
+            "1950": 4.79826,
+            "1951": 4.9677,
+            "1952": 4.73855,
+            "1953": 4.79852,
+            "1954": 4.7398,
+            "1955": 4.85209,
+            "1956": 4.88278,
+            "1957": 4.73599,
+            "1958": 4.70215,
+            "1959": 4.76471,
+            "1960": 4.76967,
+            "1961": 4.71471,
+            "1962": 4.83443,
+            "1963": 4.82459,
+            "1964": 4.85019,
+            "1965": 4.87867,
+            "1966": 4.79219,
+            "1967": 4.60013,
+            "1968": 4.83399,
+            "1969": 4.59632,
+            "1970": 4.58346,
+            "1971": 4.90585,
+            "1972": 4.89941,
+            "1973": 4.55559,
+            "1974": 4.8295,
+            "1975": 4.83261,
+            "1976": 4.71818,
+            "1977": 4.58171,
+            "1978": 5.00781,
+            "1979": 4.6663,
+            "1980": 4.74961,
+            "1981": 4.87741,
+            "1982": 4.72647,
+            "1983": 4.89363,
+            "1984": 4.64954,
+            "1985": 4.78941,
+            "1986": 4.70195,
+            "1987": 4.8185,
+            "1988": 4.89272,
+            "1989": 4.63799,
+            "1990": 4.79789,
+            "1991": 4.70399,
+            "1992": 4.80349,
+            "1993": 4.74121,
+            "1994": 4.85611,
+            "1995": 4.5595,
+            "1996": 4.65792,
+            "1997": 4.8133,
+            "1998": 4.68041,
+            "1999": 4.73244,
+            "2000": 4.6301
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 26.0,
+            "2": 32.0,
+            "3": 38.0,
+            "4": 33.0,
+            "5": 32.0,
+            "6": 30.0,
+            "7": 33.0,
+            "8": 34.0,
+            "9": 40.0,
+            "10": 31.0,
+            "11": 26.0,
+            "12": 33.0,
+            "13": 28.0,
+            "14": 29.0,
+            "15": 28.0,
+            "16": 27.0,
+            "17": 32.0,
+            "18": 28.0,
+            "19": 31.0,
+            "20": 39.0,
+            "21": 22.0,
+            "22": 29.0,
+            "23": 39.0,
+            "24": 35.0,
+            "25": 31.0,
+            "26": 40.0,
+            "27": 39.0,
+            "28": 42.0,
+            "29": 53.0,
+            "30": 51.0,
+            "31": 48.0,
+            "32": 51.0,
+            "33": 38.0,
+            "34": 48.0,
+            "35": 47.0,
+            "36": 49.0,
+            "37": 42.0,
+            "38": 43.0,
+            "39": 52.0,
+            "40": 55.0,
+            "41": 39.0,
+            "42": 54.0,
+            "43": 57.0,
+            "44": 53.0,
+            "45": 46.0,
+            "46": 61.0,
+            "47": 52.0,
+            "48": 54.0,
+            "49": 64.0,
+            "50": 64.0,
+            "51": 42.0,
+            "52": 55.0,
+            "53": 48.0,
+            "54": 71.0,
+            "55": 56.0,
+            "56": 74.0,
+            "57": 70.0,
+            "58": 57.0,
+            "59": 53.0,
+            "60": 67.0,
+            "61": 63.0,
+            "62": 59.0,
+            "63": 66.0,
+            "64": 70.0,
+            "65": 59.0,
+            "66": 74.0,
+            "67": 81.0,
+            "68": 74.0,
+            "69": 60.0,
+            "70": 60.0,
+            "71": 66.0,
+            "72": 75.0,
+            "73": 67.0,
+            "74": 63.0,
+            "75": 60.0,
+            "76": 60.0,
+            "77": 78.0,
+            "78": 78.0,
+            "79": 58.0,
+            "80": 63.0,
+            "81": 63.0,
+            "82": 50.0,
+            "83": 63.0,
+            "84": 72.0,
+            "85": 69.0,
+            "86": 80.0,
+            "87": 70.0,
+            "88": 68.0,
+            "89": 69.0,
+            "90": 63.0,
+            "91": 58.0,
+            "92": 87.0,
+            "93": 65.0,
+            "94": 50.0,
+            "95": 67.0,
+            "96": 71.0,
+            "97": 70.0,
+            "98": 81.0,
+            "99": 66.0,
+            "100": 76.0,
+            "101": 67.0,
+            "102": 44.0,
+            "103": 60.0,
+            "104": 68.0,
+            "105": 84.0,
+            "106": 61.0,
+            "107": 76.0,
+            "108": 68.0,
+            "109": 76.0,
+            "110": 74.0,
+            "111": 75.0,
+            "112": 78.0,
+            "113": 58.0,
+            "114": 66.0,
+            "115": 71.0,
+            "116": 63.0,
+            "117": 74.0,
+            "118": 52.0,
+            "119": 74.0,
+            "120": 52.0,
+            "121": 76.0,
+            "122": 66.0,
+            "123": 81.0,
+            "124": 76.0,
+            "125": 87.0,
+            "126": 49.0,
+            "127": 56.0,
+            "128": 78.0,
+            "129": 53.0,
+            "130": 76.0,
+            "131": 86.0,
+            "132": 61.0,
+            "133": 72.0,
+            "134": 62.0,
+            "135": 59.0,
+            "136": 60.0,
+            "137": 57.0,
+            "138": 81.0,
+            "139": 74.0,
+            "140": 59.0,
+            "141": 50.0,
+            "142": 64.0,
+            "143": 54.0,
+            "144": 49.0,
+            "145": 57.0,
+            "146": 51.0,
+            "147": 49.0,
+            "148": 69.0,
+            "149": 49.0,
+            "150": 66.0,
+            "151": 57.0,
+            "152": 51.0,
+            "153": 61.0,
+            "154": 58.0,
+            "155": 68.0,
+            "156": 68.0,
+            "157": 51.0,
+            "158": 68.0,
+            "159": 60.0,
+            "160": 64.0,
+            "161": 66.0,
+            "162": 75.0,
+            "163": 40.0,
+            "164": 84.0,
+            "165": 50.0,
+            "166": 68.0,
+            "167": 54.0,
+            "168": 58.0,
+            "169": 65.0,
+            "170": 71.0,
+            "171": 54.0,
+            "172": 64.0,
+            "173": 81.0,
+            "174": 55.0,
+            "175": 63.0,
+            "176": 69.0,
+            "177": 80.0,
+            "178": 68.0,
+            "179": 69.0,
+            "180": 64.0,
+            "181": 41.0,
+            "182": 63.0,
+            "183": 66.0,
+            "184": 67.0,
+            "185": 77.0,
+            "186": 77.0,
+            "187": 61.0,
+            "188": 62.0,
+            "189": 50.0,
+            "190": 57.0,
+            "191": 60.0,
+            "192": 67.0,
+            "193": 70.0,
+            "194": 72.0,
+            "195": 60.0,
+            "196": 81.0,
+            "197": 56.0,
+            "198": 47.0,
+            "199": 50.0,
+            "200": 86.0,
+            "201": 52.0,
+            "202": 64.0,
+            "203": 58.0,
+            "204": 63.0,
+            "205": 40.0,
+            "206": 72.0,
+            "207": 50.0,
+            "208": 42.0,
+            "209": 69.0,
+            "210": 68.0,
+            "211": 56.0,
+            "212": 64.0,
+            "213": 60.0,
+            "214": 62.0,
+            "215": 66.0,
+            "216": 58.0,
+            "217": 59.0,
+            "218": 70.0,
+            "219": 80.0,
+            "220": 81.0,
+            "221": 51.0,
+            "222": 57.0,
+            "223": 67.0,
+            "224": 53.0,
+            "225": 61.0,
+            "226": 68.0,
+            "227": 76.0,
+            "228": 59.0,
+            "229": 44.0,
+            "230": 50.0,
+            "231": 58.0,
+            "232": 65.0,
+            "233": 90.0,
+            "234": 60.0,
+            "235": 98.0,
+            "236": 49.0,
+            "237": 92.0,
+            "238": 71.0,
+            "239": 68.0,
+            "240": 79.0,
+            "241": 67.0,
+            "242": 75.0,
+            "243": 66.0,
+            "244": 59.0,
+            "245": 81.0,
+            "246": 80.0,
+            "247": 88.0,
+            "248": 81.0,
+            "249": 79.0,
+            "250": 80.0,
+            "251": 74.0,
+            "252": 72.0,
+            "253": 57.0,
+            "254": 67.0,
+            "255": 79.0,
+            "256": 86.0,
+            "257": 66.0,
+            "258": 94.0,
+            "259": 69.0,
+            "260": 70.0,
+            "261": 64.0,
+            "262": 77.0,
+            "263": 74.0,
+            "264": 70.0,
+            "265": 68.0,
+            "266": 67.0,
+            "267": 66.0,
+            "268": 59.0,
+            "269": 73.0,
+            "270": 85.0,
+            "271": 67.0,
+            "272": 81.0,
+            "273": 71.0,
+            "274": 69.0,
+            "275": 72.0,
+            "276": 72.0,
+            "277": 82.0,
+            "278": 61.0,
+            "279": 94.0,
+            "280": 56.0,
+            "281": 55.0,
+            "282": 73.0,
+            "283": 90.0,
+            "284": 85.0,
+            "285": 49.0,
+            "286": 50.0,
+            "287": 90.0,
+            "288": 71.0,
+            "289": 85.0,
+            "290": 75.0,
+            "291": 88.0,
+            "292": 88.0,
+            "293": 91.0,
+            "294": 84.0,
+            "295": 85.0,
+            "296": 102.0,
+            "297": 70.0,
+            "298": 65.0,
+            "299": 80.0,
+            "300": 80.0,
+            "301": 91.0,
+            "302": 94.0,
+            "303": 71.0,
+            "304": 74.0,
+            "305": 59.0,
+            "306": 72.0,
+            "307": 73.0,
+            "308": 91.0,
+            "309": 88.0,
+            "310": 82.0,
+            "311": 84.0,
+            "312": 73.0,
+            "313": 97.0,
+            "314": 74.0,
+            "315": 69.0,
+            "316": 96.0,
+            "317": 61.0,
+            "318": 99.0,
+            "319": 67.0,
+            "320": 77.0,
+            "321": 86.0,
+            "322": 70.0,
+            "323": 86.0,
+            "324": 96.0,
+            "325": 74.0,
+            "326": 97.0,
+            "327": 73.0,
+            "328": 99.0,
+            "329": 93.0,
+            "330": 96.0,
+            "331": 81.0,
+            "332": 79.0,
+            "333": 97.0,
+            "334": 81.0,
+            "335": 84.0,
+            "336": 81.0,
+            "337": 99.0,
+            "338": 89.0,
+            "339": 93.0,
+            "340": 101.0,
+            "341": 93.0,
+            "342": 57.0,
+            "343": 81.0,
+            "344": 105.0,
+            "345": 88.0,
+            "346": 85.0,
+            "347": 91.0,
+            "348": 82.0,
+            "349": 78.0,
+            "350": 101.0,
+            "351": 105.0,
+            "352": 76.0,
+            "353": 112.0,
+            "354": 72.0,
+            "355": 79.0,
+            "356": 104.0,
+            "357": 86.0,
+            "358": 77.0,
+            "359": 99.0,
+            "360": 102.0,
+            "361": 64.0,
+            "362": 123.0,
+            "363": 96.0,
+            "364": 95.0,
+            "365": 85.0,
+            "366": 82.0,
+            "367": 84.0,
+            "368": 83.0,
+            "369": 77.0,
+            "370": 118.0,
+            "371": 76.0,
+            "372": 77.0,
+            "373": 96.0,
+            "374": 68.0,
+            "375": 92.0,
+            "376": 84.0,
+            "377": 98.0,
+            "378": 99.0,
+            "379": 108.0,
+            "380": 96.0,
+            "381": 92.0,
+            "382": 75.0,
+            "383": 89.0,
+            "384": 100.0,
+            "385": 73.0,
+            "386": 85.0,
+            "387": 73.0,
+            "388": 93.0,
+            "389": 88.0,
+            "390": 90.0,
+            "391": 115.0,
+            "392": 88.0,
+            "393": 99.0,
+            "394": 104.0,
+            "395": 125.0,
+            "396": 80.0,
+            "397": 78.0,
+            "398": 67.0,
+            "399": 104.0,
+            "400": 96.0,
+            "401": 105.0,
+            "402": 88.0,
+            "403": 97.0,
+            "404": 101.0,
+            "405": 85.0,
+            "406": 114.0,
+            "407": 76.0,
+            "408": 98.0,
+            "409": 84.0,
+            "410": 102.0,
+            "411": 81.0,
+            "412": 56.0,
+            "413": 68.0,
+            "414": 90.0,
+            "415": 95.0,
+            "416": 93.0,
+            "417": 90.0,
+            "418": 60.0,
+            "419": 86.0,
+            "420": 76.0,
+            "421": 110.0,
+            "422": 89.0,
+            "423": 78.0,
+            "424": 82.0,
+            "425": 94.0,
+            "426": 80.0,
+            "427": 96.0,
+            "428": 86.0,
+            "429": 92.0,
+            "430": 84.0,
+            "431": 87.0,
+            "432": 80.0,
+            "433": 81.0,
+            "434": 93.0,
+            "435": 83.0,
+            "436": 82.0,
+            "437": 91.0,
+            "438": 62.0,
+            "439": 72.0,
+            "440": 79.0,
+            "441": 87.0,
+            "442": 106.0,
+            "443": 106.0,
+            "444": 58.0,
+            "445": 93.0,
+            "446": 89.0,
+            "447": 97.0,
+            "448": 79.0,
+            "449": 90.0,
+            "450": 83.0,
+            "451": 63.0,
+            "452": 70.0,
+            "453": 63.0,
+            "454": 80.0,
+            "455": 114.0,
+            "456": 98.0,
+            "457": 101.0,
+            "458": 70.0,
+            "459": 69.0,
+            "460": 65.0,
+            "461": 115.0,
+            "462": 63.0,
+            "463": 73.0,
+            "464": 69.0,
+            "465": 95.0,
+            "466": 76.0,
+            "467": 77.0,
+            "468": 90.0,
+            "469": 65.0,
+            "470": 91.0,
+            "471": 76.0,
+            "472": 60.0,
+            "473": 94.0,
+            "474": 69.0,
+            "475": 90.0,
+            "476": 66.0,
+            "477": 75.0,
+            "478": 78.0,
+            "479": 63.0,
+            "480": 73.0,
+            "481": 80.0,
+            "482": 77.0,
+            "483": 78.0,
+            "484": 84.0,
+            "485": 70.0,
+            "486": 84.0,
+            "487": 69.0,
+            "488": 88.0,
+            "489": 77.0,
+            "490": 59.0,
+            "491": 83.0,
+            "492": 57.0,
+            "493": 83.0,
+            "494": 69.0,
+            "495": 50.0,
+            "496": 56.0,
+            "497": 97.0,
+            "498": 77.0,
+            "499": 75.0,
+            "500": 60.0,
+            "501": 64.0,
+            "502": 64.0,
+            "503": 71.0,
+            "504": 77.0,
+            "505": 68.0,
+            "506": 65.0,
+            "507": 80.0,
+            "508": 42.0,
+            "509": 63.0,
+            "510": 77.0,
+            "511": 81.0,
+            "512": 57.0,
+            "513": 61.0,
+            "514": 60.0,
+            "515": 71.0,
+            "516": 61.0,
+            "517": 85.0,
+            "518": 43.0,
+            "519": 72.0,
+            "520": 82.0,
+            "521": 50.0,
+            "522": 58.0,
+            "523": 74.0,
+            "524": 70.0,
+            "525": 82.0,
+            "526": 60.0,
+            "527": 71.0,
+            "528": 63.0,
+            "529": 66.0,
+            "530": 67.0,
+            "531": 69.0,
+            "532": 72.0,
+            "533": 81.0,
+            "534": 62.0,
+            "535": 66.0,
+            "536": 61.0,
+            "537": 60.0,
+            "538": 55.0,
+            "539": 62.0,
+            "540": 63.0,
+            "541": 61.0,
+            "542": 61.0,
+            "543": 55.0,
+            "544": 64.0,
+            "545": 73.0,
+            "546": 77.0,
+            "547": 69.0,
+            "548": 75.0,
+            "549": 61.0,
+            "550": 61.0,
+            "551": 63.0,
+            "552": 71.0,
+            "553": 78.0,
+            "554": 67.0,
+            "555": 65.0,
+            "556": 74.0,
+            "557": 61.0,
+            "558": 62.0,
+            "559": 62.0,
+            "560": 71.0,
+            "561": 56.0,
+            "562": 65.0,
+            "563": 77.0,
+            "564": 67.0,
+            "565": 55.0,
+            "566": 58.0,
+            "567": 42.0,
+            "568": 70.0,
+            "569": 56.0,
+            "570": 60.0,
+            "571": 58.0,
+            "572": 41.0,
+            "573": 71.0,
+            "574": 69.0,
+            "575": 85.0,
+            "576": 44.0,
+            "577": 50.0,
+            "578": 69.0,
+            "579": 62.0,
+            "580": 67.0,
+            "581": 59.0,
+            "582": 58.0,
+            "583": 55.0,
+            "584": 47.0,
+            "585": 60.0,
+            "586": 41.0,
+            "587": 47.0,
+            "588": 53.0,
+            "589": 55.0,
+            "590": 46.0,
+            "591": 69.0,
+            "592": 50.0,
+            "593": 52.0,
+            "594": 56.0,
+            "595": 47.0,
+            "596": 44.0,
+            "597": 33.0,
+            "598": 61.0,
+            "599": 50.0,
+            "600": 88.0,
+            "601": 55.0,
+            "602": 64.0,
+            "603": 60.0,
+            "604": 57.0,
+            "605": 57.0,
+            "606": 45.0,
+            "607": 54.0,
+            "608": 45.0,
+            "609": 40.0,
+            "610": 45.0,
+            "611": 53.0,
+            "612": 52.0,
+            "613": 73.0,
+            "614": 53.0,
+            "615": 52.0,
+            "616": 64.0,
+            "617": 44.0,
+            "618": 59.0,
+            "619": 50.0,
+            "620": 72.0,
+            "621": 50.0,
+            "622": 58.0,
+            "623": 57.0,
+            "624": 56.0,
+            "625": 56.0,
+            "626": 71.0,
+            "627": 50.0,
+            "628": 49.0,
+            "629": 50.0,
+            "630": 50.0,
+            "631": 40.0,
+            "632": 45.0,
+            "633": 42.0,
+            "634": 38.0,
+            "635": 51.0,
+            "636": 36.0,
+            "637": 55.0,
+            "638": 45.0,
+            "639": 63.0,
+            "640": 52.0,
+            "641": 51.0,
+            "642": 52.0,
+            "643": 49.0,
+            "644": 51.0,
+            "645": 57.0,
+            "646": 57.0,
+            "647": 69.0,
+            "648": 60.0,
+            "649": 49.0,
+            "650": 49.0,
+            "651": 66.0,
+            "652": 49.0,
+            "653": 59.0,
+            "654": 42.0,
+            "655": 42.0,
+            "656": 46.0,
+            "657": 49.0,
+            "658": 50.0,
+            "659": 44.0,
+            "660": 53.0,
+            "661": 46.0,
+            "662": 60.0,
+            "663": 43.0,
+            "664": 61.0,
+            "665": 37.0,
+            "666": 30.0,
+            "667": 42.0,
+            "668": 41.0,
+            "669": 44.0,
+            "670": 44.0,
+            "671": 59.0,
+            "672": 53.0,
+            "673": 47.0,
+            "674": 42.0,
+            "675": 54.0,
+            "676": 43.0,
+            "677": 68.0,
+            "678": 41.0,
+            "679": 38.0,
+            "680": 46.0,
+            "681": 50.0,
+            "682": 33.0,
+            "683": 38.0,
+            "684": 52.0,
+            "685": 40.0,
+            "686": 43.0,
+            "687": 61.0,
+            "688": 57.0,
+            "689": 51.0,
+            "690": 35.0,
+            "691": 45.0,
+            "692": 55.0,
+            "693": 36.0,
+            "694": 50.0,
+            "695": 50.0,
+            "696": 51.0,
+            "697": 41.0,
+            "698": 37.0,
+            "699": 47.0,
+            "700": 42.0,
+            "701": 37.0,
+            "702": 33.0,
+            "703": 39.0,
+            "704": 43.0,
+            "705": 45.0,
+            "706": 32.0,
+            "707": 38.0,
+            "708": 38.0,
+            "709": 46.0,
+            "710": 35.0,
+            "711": 48.0,
+            "712": 35.0,
+            "713": 48.0,
+            "714": 37.0,
+            "715": 48.0,
+            "716": 36.0,
+            "717": 34.0,
+            "718": 26.0,
+            "719": 36.0,
+            "720": 34.0,
+            "721": 36.0,
+            "722": 35.0,
+            "723": 29.0,
+            "724": 47.0,
+            "725": 32.0,
+            "726": 39.0,
+            "727": 40.0,
+            "728": 39.0,
+            "729": 47.0,
+            "730": 36.0,
+            "731": 48.0,
+            "732": 43.0,
+            "733": 39.0,
+            "734": 51.0,
+            "735": 40.0,
+            "736": 49.0,
+            "737": 44.0,
+            "738": 27.0,
+            "739": 46.0,
+            "740": 38.0,
+            "741": 38.0,
+            "742": 45.0,
+            "743": 44.0,
+            "744": 52.0,
+            "745": 48.0,
+            "746": 50.0,
+            "747": 53.0,
+            "748": 52.0,
+            "749": 48.0,
+            "750": 46.0,
+            "751": 40.0,
+            "752": 50.0,
+            "753": 44.0,
+            "754": 43.0,
+            "755": 48.0,
+            "756": 38.0,
+            "757": 45.0,
+            "758": 40.0,
+            "759": 56.0,
+            "760": 46.0,
+            "761": 44.0,
+            "762": 48.0,
+            "763": 54.0,
+            "764": 49.0,
+            "765": 42.0,
+            "766": 57.0,
+            "767": 45.0,
+            "768": 51.0,
+            "769": 60.0,
+            "770": 51.0,
+            "771": 31.0,
+            "772": 41.0,
+            "773": 60.0,
+            "774": 37.0,
+            "775": 43.0,
+            "776": 37.0,
+            "777": 34.0,
+            "778": 42.0,
+            "779": 37.0,
+            "780": 34.0,
+            "781": 41.0,
+            "782": 25.0,
+            "783": 30.0,
+            "784": 39.0,
+            "785": 34.0,
+            "786": 38.0,
+            "787": 47.0,
+            "788": 41.0,
+            "789": 50.0,
+            "790": 44.0,
+            "791": 34.0,
+            "792": 38.0,
+            "793": 53.0,
+            "794": 45.0,
+            "795": 52.0,
+            "796": 39.0,
+            "797": 41.0,
+            "798": 39.0,
+            "799": 44.0,
+            "800": 46.0,
+            "801": 44.0,
+            "802": 40.0,
+            "803": 47.0,
+            "804": 34.0,
+            "805": 45.0,
+            "806": 43.0,
+            "807": 46.0,
+            "808": 36.0,
+            "809": 35.0,
+            "810": 35.0,
+            "811": 44.0,
+            "812": 47.0,
+            "813": 41.0,
+            "814": 36.0,
+            "815": 41.0,
+            "816": 52.0,
+            "817": 43.0,
+            "818": 35.0,
+            "819": 52.0,
+            "820": 40.0,
+            "821": 29.0,
+            "822": 34.0,
+            "823": 44.0,
+            "824": 47.0,
+            "825": 36.0,
+            "826": 40.0,
+            "827": 29.0,
+            "828": 35.0,
+            "829": 32.0,
+            "830": 30.0,
+            "831": 36.0,
+            "832": 34.0,
+            "833": 39.0,
+            "834": 50.0,
+            "835": 38.0,
+            "836": 37.0,
+            "837": 50.0,
+            "838": 45.0,
+            "839": 52.0,
+            "840": 37.0,
+            "841": 35.0,
+            "842": 30.0,
+            "843": 50.0,
+            "844": 23.0,
+            "845": 45.0,
+            "846": 25.0,
+            "847": 32.0,
+            "848": 25.0,
+            "849": 34.0,
+            "850": 39.0,
+            "851": 46.0,
+            "852": 41.0,
+            "853": 43.0,
+            "854": 45.0,
+            "855": 27.0,
+            "856": 47.0,
+            "857": 47.0,
+            "858": 46.0,
+            "859": 35.0,
+            "860": 45.0,
+            "861": 30.0,
+            "862": 39.0,
+            "863": 21.0,
+            "864": 26.0,
+            "865": 46.0,
+            "866": 44.0,
+            "867": 48.0,
+            "868": 27.0,
+            "869": 42.0,
+            "870": 45.0,
+            "871": 33.0,
+            "872": 49.0,
+            "873": 32.0,
+            "874": 56.0,
+            "875": 38.0,
+            "876": 41.0,
+            "877": 40.0,
+            "878": 37.0,
+            "879": 22.0,
+            "880": 39.0,
+            "881": 40.0,
+            "882": 49.0,
+            "883": 39.0,
+            "884": 35.0,
+            "885": 32.0,
+            "886": 45.0,
+            "887": 41.0,
+            "888": 34.0,
+            "889": 35.0,
+            "890": 37.0,
+            "891": 41.0,
+            "892": 42.0,
+            "893": 42.0,
+            "894": 34.0,
+            "895": 38.0,
+            "896": 37.0,
+            "897": 41.0,
+            "898": 33.0,
+            "899": 35.0,
+            "900": 39.0,
+            "901": 37.0,
+            "902": 39.0,
+            "903": 42.0,
+            "904": 38.0,
+            "905": 32.0,
+            "906": 34.0,
+            "907": 38.0,
+            "908": 39.0,
+            "909": 52.0,
+            "910": 34.0,
+            "911": 26.0,
+            "912": 46.0,
+            "913": 40.0,
+            "914": 48.0,
+            "915": 25.0,
+            "916": 49.0,
+            "917": 36.0,
+            "918": 31.0,
+            "919": 26.0,
+            "920": 40.0,
+            "921": 34.0,
+            "922": 38.0,
+            "923": 41.0,
+            "924": 24.0,
+            "925": 27.0,
+            "926": 43.0,
+            "927": 31.0,
+            "928": 40.0,
+            "929": 32.0,
+            "930": 42.0,
+            "931": 33.0,
+            "932": 34.0,
+            "933": 38.0,
+            "934": 41.0,
+            "935": 26.0,
+            "936": 44.0,
+            "937": 36.0,
+            "938": 37.0,
+            "939": 28.0,
+            "940": 33.0,
+            "941": 34.0,
+            "942": 31.0,
+            "943": 26.0,
+            "944": 37.0,
+            "945": 29.0,
+            "946": 31.0,
+            "947": 34.0,
+            "948": 41.0,
+            "949": 31.0,
+            "950": 35.0,
+            "951": 31.0,
+            "952": 38.0,
+            "953": 47.0,
+            "954": 43.0,
+            "955": 46.0,
+            "956": 35.0,
+            "957": 40.0,
+            "958": 37.0,
+            "959": 52.0,
+            "960": 35.0,
+            "961": 38.0,
+            "962": 41.0,
+            "963": 45.0,
+            "964": 43.0,
+            "965": 51.0,
+            "966": 38.0,
+            "967": 31.0,
+            "968": 32.0,
+            "969": 35.0,
+            "970": 48.0,
+            "971": 38.0,
+            "972": 43.0,
+            "973": 38.0,
+            "974": 40.0,
+            "975": 43.0,
+            "976": 29.0,
+            "977": 44.0,
+            "978": 31.0,
+            "979": 43.0,
+            "980": 39.0,
+            "981": 33.0,
+            "982": 30.0,
+            "983": 54.0,
+            "984": 43.0,
+            "985": 48.0,
+            "986": 40.0,
+            "987": 30.0,
+            "988": 38.0,
+            "989": 38.0,
+            "990": 42.0,
+            "991": 36.0,
+            "992": 48.0,
+            "993": 47.0,
+            "994": 50.0,
+            "995": 35.0,
+            "996": 29.0,
+            "997": 51.0,
+            "998": 42.0,
+            "999": 35.0,
+            "1000": 28.0,
+            "1001": 23.0,
+            "1002": 35.0,
+            "1003": 39.0,
+            "1004": 46.0,
+            "1005": 42.0,
+            "1006": 27.0,
+            "1007": 44.0,
+            "1008": 32.0,
+            "1009": 34.0,
+            "1010": 29.0,
+            "1011": 31.0,
+            "1012": 28.0,
+            "1013": 37.0,
+            "1014": 29.0,
+            "1015": 39.0,
+            "1016": 31.0,
+            "1017": 37.0,
+            "1018": 46.0,
+            "1019": 26.0,
+            "1020": 34.0,
+            "1021": 30.0,
+            "1022": 46.0,
+            "1023": 38.0,
+            "1024": 49.0,
+            "1025": 41.0,
+            "1026": 55.0,
+            "1027": 37.0,
+            "1028": 29.0,
+            "1029": 38.0,
+            "1030": 35.0,
+            "1031": 41.0,
+            "1032": 42.0,
+            "1033": 27.0,
+            "1034": 29.0,
+            "1035": 32.0,
+            "1036": 25.0,
+            "1037": 34.0,
+            "1038": 32.0,
+            "1039": 31.0,
+            "1040": 30.0,
+            "1041": 24.0,
+            "1042": 20.0,
+            "1043": 26.0,
+            "1044": 44.0,
+            "1045": 37.0,
+            "1046": 34.0,
+            "1047": 27.0,
+            "1048": 36.0,
+            "1049": 42.0,
+            "1050": 37.0,
+            "1051": 40.0,
+            "1052": 40.0,
+            "1053": 32.0,
+            "1054": 37.0,
+            "1055": 31.0,
+            "1056": 36.0,
+            "1057": 37.0,
+            "1058": 37.0,
+            "1059": 35.0,
+            "1060": 32.0,
+            "1061": 37.0,
+            "1062": 45.0,
+            "1063": 38.0,
+            "1064": 42.0,
+            "1065": 35.0,
+            "1066": 36.0,
+            "1067": 29.0,
+            "1068": 30.0,
+            "1069": 30.0,
+            "1070": 39.0,
+            "1071": 33.0,
+            "1072": 36.0,
+            "1073": 41.0,
+            "1074": 47.0,
+            "1075": 36.0,
+            "1076": 39.0,
+            "1077": 45.0,
+            "1078": 32.0,
+            "1079": 46.0,
+            "1080": 43.0,
+            "1081": 40.0,
+            "1082": 42.0,
+            "1083": 42.0,
+            "1084": 42.0,
+            "1085": 38.0,
+            "1086": 42.0,
+            "1087": 36.0,
+            "1088": 31.0,
+            "1089": 42.0,
+            "1090": 28.0,
+            "1091": 36.0,
+            "1092": 35.0,
+            "1093": 36.0,
+            "1094": 41.0,
+            "1095": 37.0,
+            "1096": 48.0,
+            "1097": 33.0,
+            "1098": 24.0,
+            "1099": 43.0,
+            "1100": 41.0,
+            "1101": 38.0,
+            "1102": 39.0,
+            "1103": 29.0,
+            "1104": 33.0,
+            "1105": 38.0,
+            "1106": 37.0,
+            "1107": 30.0,
+            "1108": 41.0,
+            "1109": 41.0,
+            "1110": 42.0,
+            "1111": 43.0,
+            "1112": 25.0,
+            "1113": 40.0,
+            "1114": 32.0,
+            "1115": 34.0,
+            "1116": 45.0,
+            "1117": 40.0,
+            "1118": 39.0,
+            "1119": 31.0,
+            "1120": 28.0,
+            "1121": 28.0,
+            "1122": 28.0,
+            "1123": 43.0,
+            "1124": 34.0,
+            "1125": 26.0,
+            "1126": 33.0,
+            "1127": 31.0,
+            "1128": 33.0,
+            "1129": 43.0,
+            "1130": 43.0,
+            "1131": 40.0,
+            "1132": 42.0,
+            "1133": 34.0,
+            "1134": 32.0,
+            "1135": 29.0,
+            "1136": 36.0,
+            "1137": 42.0,
+            "1138": 34.0,
+            "1139": 31.0,
+            "1140": 38.0,
+            "1141": 37.0,
+            "1142": 38.0,
+            "1143": 44.0,
+            "1144": 40.0,
+            "1145": 39.0,
+            "1146": 42.0,
+            "1147": 35.0,
+            "1148": 29.0,
+            "1149": 40.0,
+            "1150": 34.0,
+            "1151": 27.0,
+            "1152": 22.0,
+            "1153": 36.0,
+            "1154": 31.0,
+            "1155": 41.0,
+            "1156": 26.0,
+            "1157": 33.0,
+            "1158": 35.0,
+            "1159": 36.0,
+            "1160": 41.0,
+            "1161": 40.0,
+            "1162": 48.0,
+            "1163": 37.0,
+            "1164": 43.0,
+            "1165": 34.0,
+            "1166": 30.0,
+            "1167": 34.0,
+            "1168": 31.0,
+            "1169": 41.0,
+            "1170": 27.0,
+            "1171": 40.0,
+            "1172": 34.0,
+            "1173": 23.0,
+            "1174": 40.0,
+            "1175": 30.0,
+            "1176": 50.0,
+            "1177": 39.0,
+            "1178": 33.0,
+            "1179": 42.0,
+            "1180": 31.0,
+            "1181": 30.0,
+            "1182": 38.0,
+            "1183": 37.0,
+            "1184": 35.0,
+            "1185": 31.0,
+            "1186": 29.0,
+            "1187": 39.0,
+            "1188": 34.0,
+            "1189": 48.0,
+            "1190": 32.0,
+            "1191": 41.0,
+            "1192": 45.0,
+            "1193": 28.0,
+            "1194": 46.0,
+            "1195": 34.0,
+            "1196": 38.0,
+            "1197": 51.0,
+            "1198": 36.0,
+            "1199": 40.0,
+            "1200": 29.0,
+            "1201": 37.0,
+            "1202": 32.0,
+            "1203": 35.0,
+            "1204": 37.0,
+            "1205": 56.0,
+            "1206": 40.0,
+            "1207": 36.0,
+            "1208": 41.0,
+            "1209": 31.0,
+            "1210": 39.0,
+            "1211": 46.0,
+            "1212": 45.0,
+            "1213": 57.0,
+            "1214": 31.0,
+            "1215": 33.0,
+            "1216": 31.0,
+            "1217": 34.0,
+            "1218": 42.0,
+            "1219": 45.0,
+            "1220": 37.0,
+            "1221": 44.0,
+            "1222": 32.0,
+            "1223": 35.0,
+            "1224": 34.0,
+            "1225": 45.0,
+            "1226": 28.0,
+            "1227": 34.0,
+            "1228": 27.0,
+            "1229": 23.0,
+            "1230": 25.0,
+            "1231": 14.0,
+            "1232": 36.0,
+            "1233": 39.0,
+            "1234": 37.0,
+            "1235": 32.0,
+            "1236": 41.0,
+            "1237": 30.0,
+            "1238": 36.0,
+            "1239": 37.0,
+            "1240": 48.0,
+            "1241": 31.0,
+            "1242": 34.0,
+            "1243": 35.0,
+            "1244": 29.0,
+            "1245": 28.0,
+            "1246": 36.0,
+            "1247": 31.0,
+            "1248": 38.0,
+            "1249": 27.0,
+            "1250": 40.0,
+            "1251": 26.0,
+            "1252": 42.0,
+            "1253": 32.0,
+            "1254": 39.0,
+            "1255": 46.0,
+            "1256": 41.0,
+            "1257": 30.0,
+            "1258": 44.0,
+            "1259": 32.0,
+            "1260": 25.0,
+            "1261": 42.0,
+            "1262": 36.0,
+            "1263": 34.0,
+            "1264": 32.0,
+            "1265": 35.0,
+            "1266": 34.0,
+            "1267": 38.0,
+            "1268": 43.0,
+            "1269": 30.0,
+            "1270": 28.0,
+            "1271": 42.0,
+            "1272": 32.0,
+            "1273": 40.0,
+            "1274": 44.0,
+            "1275": 38.0,
+            "1276": 31.0,
+            "1277": 54.0,
+            "1278": 46.0,
+            "1279": 44.0,
+            "1280": 34.0,
+            "1281": 26.0,
+            "1282": 37.0,
+            "1283": 32.0,
+            "1284": 43.0,
+            "1285": 43.0,
+            "1286": 36.0,
+            "1287": 46.0,
+            "1288": 33.0,
+            "1289": 43.0,
+            "1290": 37.0,
+            "1291": 42.0,
+            "1292": 38.0,
+            "1293": 43.0,
+            "1294": 30.0,
+            "1295": 34.0,
+            "1296": 31.0,
+            "1297": 26.0,
+            "1298": 38.0,
+            "1299": 40.0,
+            "1300": 32.0,
+            "1301": 43.0,
+            "1302": 35.0,
+            "1303": 35.0,
+            "1304": 41.0,
+            "1305": 30.0,
+            "1306": 28.0,
+            "1307": 34.0,
+            "1308": 32.0,
+            "1309": 36.0,
+            "1310": 29.0,
+            "1311": 43.0,
+            "1312": 32.0,
+            "1313": 37.0,
+            "1314": 35.0,
+            "1315": 33.0,
+            "1316": 37.0,
+            "1317": 33.0,
+            "1318": 41.0,
+            "1319": 28.0,
+            "1320": 42.0,
+            "1321": 30.0,
+            "1322": 21.0,
+            "1323": 28.0,
+            "1324": 40.0,
+            "1325": 36.0,
+            "1326": 43.0,
+            "1327": 32.0,
+            "1328": 35.0,
+            "1329": 33.0,
+            "1330": 27.0,
+            "1331": 30.0,
+            "1332": 36.0,
+            "1333": 45.0,
+            "1334": 32.0,
+            "1335": 41.0,
+            "1336": 38.0,
+            "1337": 37.0,
+            "1338": 38.0,
+            "1339": 27.0,
+            "1340": 33.0,
+            "1341": 47.0,
+            "1342": 24.0,
+            "1343": 27.0,
+            "1344": 34.0,
+            "1345": 34.0,
+            "1346": 21.0,
+            "1347": 33.0,
+            "1348": 33.0,
+            "1349": 42.0,
+            "1350": 30.0,
+            "1351": 39.0,
+            "1352": 26.0,
+            "1353": 36.0,
+            "1354": 40.0,
+            "1355": 31.0,
+            "1356": 46.0,
+            "1357": 46.0,
+            "1358": 29.0,
+            "1359": 29.0,
+            "1360": 30.0,
+            "1361": 35.0,
+            "1362": 40.0,
+            "1363": 33.0,
+            "1364": 36.0,
+            "1365": 34.0,
+            "1366": 47.0,
+            "1367": 31.0,
+            "1368": 37.0,
+            "1369": 28.0,
+            "1370": 41.0,
+            "1371": 30.0,
+            "1372": 42.0,
+            "1373": 44.0,
+            "1374": 34.0,
+            "1375": 22.0,
+            "1376": 47.0,
+            "1377": 29.0,
+            "1378": 39.0,
+            "1379": 49.0,
+            "1380": 44.0,
+            "1381": 30.0,
+            "1382": 45.0,
+            "1383": 44.0,
+            "1384": 31.0,
+            "1385": 35.0,
+            "1386": 31.0,
+            "1387": 31.0,
+            "1388": 22.0,
+            "1389": 32.0,
+            "1390": 38.0,
+            "1391": 42.0,
+            "1392": 34.0,
+            "1393": 43.0,
+            "1394": 33.0,
+            "1395": 39.0,
+            "1396": 37.0,
+            "1397": 27.0,
+            "1398": 33.0,
+            "1399": 29.0,
+            "1400": 36.0,
+            "1401": 28.0,
+            "1402": 27.0,
+            "1403": 23.0,
+            "1404": 28.0,
+            "1405": 36.0,
+            "1406": 29.0,
+            "1407": 36.0,
+            "1408": 43.0,
+            "1409": 37.0,
+            "1410": 37.0,
+            "1411": 38.0,
+            "1412": 28.0,
+            "1413": 48.0,
+            "1414": 34.0,
+            "1415": 42.0,
+            "1416": 35.0,
+            "1417": 34.0,
+            "1418": 43.0,
+            "1419": 38.0,
+            "1420": 33.0,
+            "1421": 33.0,
+            "1422": 53.0,
+            "1423": 22.0,
+            "1424": 35.0,
+            "1425": 43.0,
+            "1426": 36.0,
+            "1427": 43.0,
+            "1428": 31.0,
+            "1429": 30.0,
+            "1430": 36.0,
+            "1431": 29.0,
+            "1432": 37.0,
+            "1433": 32.0,
+            "1434": 47.0,
+            "1435": 38.0,
+            "1436": 40.0,
+            "1437": 47.0,
+            "1438": 28.0,
+            "1439": 33.0,
+            "1440": 25.0,
+            "1441": 35.0,
+            "1442": 38.0,
+            "1443": 42.0,
+            "1444": 28.0,
+            "1445": 34.0,
+            "1446": 28.0,
+            "1447": 39.0,
+            "1448": 45.0,
+            "1449": 41.0,
+            "1450": 25.0,
+            "1451": 38.0,
+            "1452": 27.0,
+            "1453": 28.0,
+            "1454": 28.0,
+            "1455": 32.0,
+            "1456": 40.0,
+            "1457": 33.0,
+            "1458": 37.0,
+            "1459": 41.0,
+            "1460": 31.0,
+            "1461": 34.0,
+            "1462": 23.0,
+            "1463": 33.0,
+            "1464": 42.0,
+            "1465": 42.0,
+            "1466": 29.0,
+            "1467": 27.0,
+            "1468": 41.0,
+            "1469": 30.0,
+            "1470": 35.0,
+            "1471": 32.0,
+            "1472": 44.0,
+            "1473": 53.0,
+            "1474": 28.0,
+            "1475": 25.0,
+            "1476": 47.0,
+            "1477": 40.0,
+            "1478": 26.0,
+            "1479": 33.0,
+            "1480": 33.0,
+            "1481": 33.0,
+            "1482": 33.0,
+            "1483": 31.0,
+            "1484": 31.0,
+            "1485": 45.0,
+            "1486": 37.0,
+            "1487": 32.0,
+            "1488": 26.0,
+            "1489": 45.0,
+            "1490": 40.0,
+            "1491": 44.0,
+            "1492": 44.0,
+            "1493": 44.0,
+            "1494": 33.0,
+            "1495": 42.0,
+            "1496": 32.0,
+            "1497": 39.0,
+            "1498": 32.0,
+            "1499": 42.0,
+            "1500": 42.0,
+            "1501": 46.0,
+            "1502": 46.0,
+            "1503": 39.0,
+            "1504": 31.0,
+            "1505": 47.0,
+            "1506": 41.0,
+            "1507": 35.0,
+            "1508": 39.0,
+            "1509": 32.0,
+            "1510": 37.0,
+            "1511": 52.0,
+            "1512": 29.0,
+            "1513": 46.0,
+            "1514": 40.0,
+            "1515": 41.0,
+            "1516": 31.0,
+            "1517": 39.0,
+            "1518": 40.0,
+            "1519": 32.0,
+            "1520": 34.0,
+            "1521": 44.0,
+            "1522": 53.0,
+            "1523": 40.0,
+            "1524": 39.0,
+            "1525": 30.0,
+            "1526": 34.0,
+            "1527": 19.0,
+            "1528": 40.0,
+            "1529": 30.0,
+            "1530": 38.0,
+            "1531": 28.0,
+            "1532": 30.0,
+            "1533": 43.0,
+            "1534": 34.0,
+            "1535": 35.0,
+            "1536": 34.0,
+            "1537": 33.0,
+            "1538": 36.0,
+            "1539": 32.0,
+            "1540": 38.0,
+            "1541": 35.0,
+            "1542": 50.0,
+            "1543": 50.0,
+            "1544": 38.0,
+            "1545": 38.0,
+            "1546": 35.0,
+            "1547": 31.0,
+            "1548": 39.0,
+            "1549": 36.0,
+            "1550": 30.0,
+            "1551": 42.0,
+            "1552": 49.0,
+            "1553": 46.0,
+            "1554": 41.0,
+            "1555": 25.0,
+            "1556": 33.0,
+            "1557": 46.0,
+            "1558": 43.0,
+            "1559": 36.0,
+            "1560": 30.0,
+            "1561": 48.0,
+            "1562": 30.0,
+            "1563": 38.0,
+            "1564": 40.0,
+            "1565": 30.0,
+            "1566": 34.0,
+            "1567": 36.0,
+            "1568": 43.0,
+            "1569": 35.0,
+            "1570": 43.0,
+            "1571": 32.0,
+            "1572": 34.0,
+            "1573": 35.0,
+            "1574": 31.0,
+            "1575": 39.0,
+            "1576": 30.0,
+            "1577": 41.0,
+            "1578": 46.0,
+            "1579": 35.0,
+            "1580": 39.0,
+            "1581": 43.0,
+            "1582": 30.0,
+            "1583": 43.0,
+            "1584": 36.0,
+            "1585": 37.0,
+            "1586": 44.0,
+            "1587": 37.0,
+            "1588": 43.0,
+            "1589": 41.0,
+            "1590": 46.0,
+            "1591": 32.0,
+            "1592": 37.0,
+            "1593": 32.0,
+            "1594": 36.0,
+            "1595": 27.0,
+            "1596": 40.0,
+            "1597": 36.0,
+            "1598": 36.0,
+            "1599": 32.0,
+            "1600": 41.0,
+            "1601": 34.0,
+            "1602": 38.0,
+            "1603": 48.0,
+            "1604": 29.0,
+            "1605": 42.0,
+            "1606": 33.0,
+            "1607": 41.0,
+            "1608": 40.0,
+            "1609": 42.0,
+            "1610": 37.0,
+            "1611": 35.0,
+            "1612": 37.0,
+            "1613": 39.0,
+            "1614": 51.0,
+            "1615": 38.0,
+            "1616": 33.0,
+            "1617": 45.0,
+            "1618": 43.0,
+            "1619": 32.0,
+            "1620": 43.0,
+            "1621": 47.0,
+            "1622": 36.0,
+            "1623": 50.0,
+            "1624": 40.0,
+            "1625": 33.0,
+            "1626": 39.0,
+            "1627": 34.0,
+            "1628": 40.0,
+            "1629": 30.0,
+            "1630": 34.0,
+            "1631": 45.0,
+            "1632": 39.0,
+            "1633": 40.0,
+            "1634": 30.0,
+            "1635": 53.0,
+            "1636": 31.0,
+            "1637": 35.0,
+            "1638": 39.0,
+            "1639": 42.0,
+            "1640": 37.0,
+            "1641": 43.0,
+            "1642": 30.0,
+            "1643": 43.0,
+            "1644": 36.0,
+            "1645": 37.0,
+            "1646": 61.0,
+            "1647": 34.0,
+            "1648": 41.0,
+            "1649": 39.0,
+            "1650": 42.0,
+            "1651": 33.0,
+            "1652": 45.0,
+            "1653": 25.0,
+            "1654": 36.0,
+            "1655": 29.0,
+            "1656": 45.0,
+            "1657": 37.0,
+            "1658": 46.0,
+            "1659": 38.0,
+            "1660": 46.0,
+            "1661": 41.0,
+            "1662": 35.0,
+            "1663": 35.0,
+            "1664": 37.0,
+            "1665": 30.0,
+            "1666": 44.0,
+            "1667": 45.0,
+            "1668": 40.0,
+            "1669": 35.0,
+            "1670": 35.0,
+            "1671": 37.0,
+            "1672": 32.0,
+            "1673": 48.0,
+            "1674": 41.0,
+            "1675": 40.0,
+            "1676": 49.0,
+            "1677": 35.0,
+            "1678": 30.0,
+            "1679": 45.0,
+            "1680": 40.0,
+            "1681": 32.0,
+            "1682": 32.0,
+            "1683": 42.0,
+            "1684": 44.0,
+            "1685": 47.0,
+            "1686": 30.0,
+            "1687": 31.0,
+            "1688": 31.0,
+            "1689": 40.0,
+            "1690": 43.0,
+            "1691": 36.0,
+            "1692": 31.0,
+            "1693": 31.0,
+            "1694": 35.0,
+            "1695": 41.0,
+            "1696": 32.0,
+            "1697": 27.0,
+            "1698": 39.0,
+            "1699": 41.0,
+            "1700": 31.0,
+            "1701": 35.0,
+            "1702": 31.0,
+            "1703": 40.0,
+            "1704": 36.0,
+            "1705": 36.0,
+            "1706": 46.0,
+            "1707": 26.0,
+            "1708": 37.0,
+            "1709": 37.0,
+            "1710": 39.0,
+            "1711": 32.0,
+            "1712": 46.0,
+            "1713": 44.0,
+            "1714": 45.0,
+            "1715": 43.0,
+            "1716": 30.0,
+            "1717": 41.0,
+            "1718": 43.0,
+            "1719": 28.0,
+            "1720": 36.0,
+            "1721": 26.0,
+            "1722": 42.0,
+            "1723": 42.0,
+            "1724": 39.0,
+            "1725": 28.0,
+            "1726": 46.0,
+            "1727": 43.0,
+            "1728": 40.0,
+            "1729": 44.0,
+            "1730": 38.0,
+            "1731": 26.0,
+            "1732": 39.0,
+            "1733": 44.0,
+            "1734": 39.0,
+            "1735": 34.0,
+            "1736": 46.0,
+            "1737": 46.0,
+            "1738": 34.0,
+            "1739": 47.0,
+            "1740": 44.0,
+            "1741": 31.0,
+            "1742": 46.0,
+            "1743": 43.0,
+            "1744": 46.0,
+            "1745": 53.0,
+            "1746": 42.0,
+            "1747": 37.0,
+            "1748": 37.0,
+            "1749": 47.0,
+            "1750": 46.0,
+            "1751": 43.0,
+            "1752": 35.0,
+            "1753": 41.0,
+            "1754": 40.0,
+            "1755": 32.0,
+            "1756": 36.0,
+            "1757": 48.0,
+            "1758": 34.0,
+            "1759": 49.0,
+            "1760": 46.0,
+            "1761": 36.0,
+            "1762": 34.0,
+            "1763": 36.0,
+            "1764": 39.0,
+            "1765": 24.0,
+            "1766": 46.0,
+            "1767": 46.0,
+            "1768": 36.0,
+            "1769": 56.0,
+            "1770": 28.0,
+            "1771": 42.0,
+            "1772": 52.0,
+            "1773": 45.0,
+            "1774": 37.0,
+            "1775": 33.0,
+            "1776": 43.0,
+            "1777": 54.0,
+            "1778": 39.0,
+            "1779": 33.0,
+            "1780": 39.0,
+            "1781": 45.0,
+            "1782": 35.0,
+            "1783": 43.0,
+            "1784": 53.0,
+            "1785": 36.0,
+            "1786": 38.0,
+            "1787": 43.0,
+            "1788": 45.0,
+            "1789": 33.0,
+            "1790": 42.0,
+            "1791": 44.0,
+            "1792": 34.0,
+            "1793": 30.0,
+            "1794": 40.0,
+            "1795": 55.0,
+            "1796": 33.0,
+            "1797": 30.0,
+            "1798": 41.0,
+            "1799": 37.0,
+            "1800": 41.0,
+            "1801": 40.0,
+            "1802": 30.0,
+            "1803": 36.0,
+            "1804": 41.0,
+            "1805": 34.0,
+            "1806": 39.0,
+            "1807": 36.0,
+            "1808": 43.0,
+            "1809": 45.0,
+            "1810": 41.0,
+            "1811": 28.0,
+            "1812": 33.0,
+            "1813": 30.0,
+            "1814": 36.0,
+            "1815": 35.0,
+            "1816": 35.0,
+            "1817": 35.0,
+            "1818": 42.0,
+            "1819": 25.0,
+            "1820": 38.0,
+            "1821": 48.0,
+            "1822": 38.0,
+            "1823": 38.0,
+            "1824": 49.0,
+            "1825": 46.0,
+            "1826": 32.0,
+            "1827": 47.0,
+            "1828": 30.0,
+            "1829": 50.0,
+            "1830": 43.0,
+            "1831": 36.0,
+            "1832": 47.0,
+            "1833": 42.0,
+            "1834": 41.0,
+            "1835": 39.0,
+            "1836": 39.0,
+            "1837": 34.0,
+            "1838": 50.0,
+            "1839": 35.0,
+            "1840": 41.0,
+            "1841": 30.0,
+            "1842": 34.0,
+            "1843": 44.0,
+            "1844": 38.0,
+            "1845": 41.0,
+            "1846": 32.0,
+            "1847": 32.0,
+            "1848": 36.0,
+            "1849": 45.0,
+            "1850": 40.0,
+            "1851": 36.0,
+            "1852": 41.0,
+            "1853": 29.0,
+            "1854": 35.0,
+            "1855": 45.0,
+            "1856": 39.0,
+            "1857": 33.0,
+            "1858": 40.0,
+            "1859": 40.0,
+            "1860": 48.0,
+            "1861": 37.0,
+            "1862": 46.0,
+            "1863": 47.0,
+            "1864": 48.0,
+            "1865": 38.0,
+            "1866": 51.0,
+            "1867": 34.0,
+            "1868": 40.0,
+            "1869": 42.0,
+            "1870": 38.0,
+            "1871": 36.0,
+            "1872": 42.0,
+            "1873": 42.0,
+            "1874": 38.0,
+            "1875": 51.0,
+            "1876": 39.0,
+            "1877": 41.0,
+            "1878": 26.0,
+            "1879": 33.0,
+            "1880": 41.0,
+            "1881": 50.0,
+            "1882": 37.0,
+            "1883": 45.0,
+            "1884": 39.0,
+            "1885": 37.0,
+            "1886": 32.0,
+            "1887": 36.0,
+            "1888": 28.0,
+            "1889": 38.0,
+            "1890": 37.0,
+            "1891": 51.0,
+            "1892": 44.0,
+            "1893": 50.0,
+            "1894": 44.0,
+            "1895": 35.0,
+            "1896": 34.0,
+            "1897": 35.0,
+            "1898": 31.0,
+            "1899": 39.0,
+            "1900": 40.0,
+            "1901": 52.0,
+            "1902": 31.0,
+            "1903": 44.0,
+            "1904": 45.0,
+            "1905": 32.0,
+            "1906": 49.0,
+            "1907": 34.0,
+            "1908": 33.0,
+            "1909": 34.0,
+            "1910": 45.0,
+            "1911": 41.0,
+            "1912": 46.0,
+            "1913": 46.0,
+            "1914": 51.0,
+            "1915": 35.0,
+            "1916": 42.0,
+            "1917": 40.0,
+            "1918": 32.0,
+            "1919": 54.0,
+            "1920": 41.0,
+            "1921": 40.0,
+            "1922": 36.0,
+            "1923": 34.0,
+            "1924": 43.0,
+            "1925": 47.0,
+            "1926": 42.0,
+            "1927": 37.0,
+            "1928": 40.0,
+            "1929": 40.0,
+            "1930": 39.0,
+            "1931": 37.0,
+            "1932": 40.0,
+            "1933": 46.0,
+            "1934": 30.0,
+            "1935": 50.0,
+            "1936": 51.0,
+            "1937": 34.0,
+            "1938": 38.0,
+            "1939": 44.0,
+            "1940": 35.0,
+            "1941": 39.0,
+            "1942": 59.0,
+            "1943": 42.0,
+            "1944": 46.0,
+            "1945": 36.0,
+            "1946": 43.0,
+            "1947": 39.0,
+            "1948": 39.0,
+            "1949": 31.0,
+            "1950": 36.0,
+            "1951": 41.0,
+            "1952": 37.0,
+            "1953": 26.0,
+            "1954": 43.0,
+            "1955": 33.0,
+            "1956": 37.0,
+            "1957": 48.0,
+            "1958": 35.0,
+            "1959": 44.0,
+            "1960": 35.0,
+            "1961": 28.0,
+            "1962": 51.0,
+            "1963": 47.0,
+            "1964": 33.0,
+            "1965": 56.0,
+            "1966": 46.0,
+            "1967": 33.0,
+            "1968": 53.0,
+            "1969": 36.0,
+            "1970": 47.0,
+            "1971": 35.0,
+            "1972": 34.0,
+            "1973": 38.0,
+            "1974": 46.0,
+            "1975": 32.0,
+            "1976": 43.0,
+            "1977": 38.0,
+            "1978": 43.0,
+            "1979": 49.0,
+            "1980": 32.0,
+            "1981": 30.0,
+            "1982": 55.0,
+            "1983": 41.0,
+            "1984": 62.0,
+            "1985": 41.0,
+            "1986": 48.0,
+            "1987": 48.0,
+            "1988": 41.0,
+            "1989": 50.0,
+            "1990": 53.0,
+            "1991": 45.0,
+            "1992": 46.0,
+            "1993": 60.0,
+            "1994": 30.0,
+            "1995": 41.0,
+            "1996": 51.0,
+            "1997": 41.0,
+            "1998": 45.0,
+            "1999": 32.0,
+            "2000": 43.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 302618112.0,
+            "2": 302618112.0,
+            "3": 302618112.0,
+            "4": 302618112.0,
+            "5": 302618112.0,
+            "6": 302618112.0,
+            "7": 302618112.0,
+            "8": 302618112.0,
+            "9": 302618112.0,
+            "10": 302618112.0,
+            "11": 302618112.0,
+            "12": 302618112.0,
+            "13": 302618112.0,
+            "14": 302618112.0,
+            "15": 302618112.0,
+            "16": 302618112.0,
+            "17": 302618112.0,
+            "18": 302618112.0,
+            "19": 302618112.0,
+            "20": 302618112.0,
+            "21": 302618112.0,
+            "22": 302618112.0,
+            "23": 302618112.0,
+            "24": 302618112.0,
+            "25": 302618112.0,
+            "26": 302618112.0,
+            "27": 302618112.0,
+            "28": 302618112.0,
+            "29": 302618112.0,
+            "30": 302618112.0,
+            "31": 302618112.0,
+            "32": 302618112.0,
+            "33": 302618112.0,
+            "34": 302618112.0,
+            "35": 302618112.0,
+            "36": 302618112.0,
+            "37": 302618112.0,
+            "38": 302618112.0,
+            "39": 302618112.0,
+            "40": 302618112.0,
+            "41": 302618112.0,
+            "42": 302618112.0,
+            "43": 302618112.0,
+            "44": 302618112.0,
+            "45": 302618112.0,
+            "46": 302618112.0,
+            "47": 302618112.0,
+            "48": 302618112.0,
+            "49": 302618112.0,
+            "50": 302618112.0,
+            "51": 302618112.0,
+            "52": 302618112.0,
+            "53": 302618112.0,
+            "54": 302618112.0,
+            "55": 302618112.0,
+            "56": 302618112.0,
+            "57": 302618112.0,
+            "58": 302618112.0,
+            "59": 302618112.0,
+            "60": 302618112.0,
+            "61": 302618112.0,
+            "62": 302618112.0,
+            "63": 302618112.0,
+            "64": 302618112.0,
+            "65": 302618112.0,
+            "66": 302618112.0,
+            "67": 302618112.0,
+            "68": 302618112.0,
+            "69": 302618112.0,
+            "70": 302618112.0,
+            "71": 302618112.0,
+            "72": 302618112.0,
+            "73": 302618112.0,
+            "74": 302618112.0,
+            "75": 302618112.0,
+            "76": 302618112.0,
+            "77": 302618112.0,
+            "78": 302618112.0,
+            "79": 302618112.0,
+            "80": 302618112.0,
+            "81": 302618112.0,
+            "82": 302618112.0,
+            "83": 302618112.0,
+            "84": 302618112.0,
+            "85": 302618112.0,
+            "86": 302618112.0,
+            "87": 302618112.0,
+            "88": 302618112.0,
+            "89": 302618112.0,
+            "90": 302618112.0,
+            "91": 302618112.0,
+            "92": 302618112.0,
+            "93": 302618112.0,
+            "94": 302618112.0,
+            "95": 302618112.0,
+            "96": 302618112.0,
+            "97": 302618112.0,
+            "98": 302618112.0,
+            "99": 302618112.0,
+            "100": 302618112.0,
+            "101": 302618112.0,
+            "102": 302618112.0,
+            "103": 302618112.0,
+            "104": 302618112.0,
+            "105": 302618112.0,
+            "106": 302618112.0,
+            "107": 302618112.0,
+            "108": 302618112.0,
+            "109": 302618112.0,
+            "110": 302618112.0,
+            "111": 302618112.0,
+            "112": 302618112.0,
+            "113": 302618112.0,
+            "114": 302618112.0,
+            "115": 302618112.0,
+            "116": 302618112.0,
+            "117": 302618112.0,
+            "118": 302618112.0,
+            "119": 302618112.0,
+            "120": 302618112.0,
+            "121": 302618112.0,
+            "122": 302618112.0,
+            "123": 302618112.0,
+            "124": 302618112.0,
+            "125": 302618112.0,
+            "126": 302618112.0,
+            "127": 302618112.0,
+            "128": 302618112.0,
+            "129": 302618112.0,
+            "130": 302618112.0,
+            "131": 302618112.0,
+            "132": 302618112.0,
+            "133": 302618112.0,
+            "134": 302618112.0,
+            "135": 302618112.0,
+            "136": 302618112.0,
+            "137": 302618112.0,
+            "138": 302618112.0,
+            "139": 302618112.0,
+            "140": 302618112.0,
+            "141": 302618112.0,
+            "142": 302618112.0,
+            "143": 302618112.0,
+            "144": 302618112.0,
+            "145": 302618112.0,
+            "146": 302618112.0,
+            "147": 302618112.0,
+            "148": 302618112.0,
+            "149": 302618112.0,
+            "150": 302618112.0,
+            "151": 302618112.0,
+            "152": 302618112.0,
+            "153": 302618112.0,
+            "154": 302618112.0,
+            "155": 302618112.0,
+            "156": 302618112.0,
+            "157": 302618112.0,
+            "158": 302618112.0,
+            "159": 302618112.0,
+            "160": 302618112.0,
+            "161": 302618112.0,
+            "162": 302618112.0,
+            "163": 302618112.0,
+            "164": 302618112.0,
+            "165": 302618112.0,
+            "166": 302618112.0,
+            "167": 302618112.0,
+            "168": 302618112.0,
+            "169": 302618112.0,
+            "170": 302618112.0,
+            "171": 302618112.0,
+            "172": 302618112.0,
+            "173": 302618112.0,
+            "174": 302618112.0,
+            "175": 302618112.0,
+            "176": 302618112.0,
+            "177": 302618112.0,
+            "178": 302618112.0,
+            "179": 302618112.0,
+            "180": 302618112.0,
+            "181": 302618112.0,
+            "182": 302618112.0,
+            "183": 302618112.0,
+            "184": 302618112.0,
+            "185": 302618112.0,
+            "186": 302618112.0,
+            "187": 302618112.0,
+            "188": 302618112.0,
+            "189": 302618112.0,
+            "190": 302618112.0,
+            "191": 302618112.0,
+            "192": 302618112.0,
+            "193": 302618112.0,
+            "194": 302618112.0,
+            "195": 302618112.0,
+            "196": 302618112.0,
+            "197": 302618112.0,
+            "198": 302618112.0,
+            "199": 302618112.0,
+            "200": 302618112.0,
+            "201": 302618112.0,
+            "202": 302618112.0,
+            "203": 302618112.0,
+            "204": 302618112.0,
+            "205": 302618112.0,
+            "206": 302618112.0,
+            "207": 302618112.0,
+            "208": 302618112.0,
+            "209": 302618112.0,
+            "210": 302618112.0,
+            "211": 302618112.0,
+            "212": 302618112.0,
+            "213": 302618112.0,
+            "214": 302618112.0,
+            "215": 302618112.0,
+            "216": 302618112.0,
+            "217": 302618112.0,
+            "218": 302618112.0,
+            "219": 302618112.0,
+            "220": 302618112.0,
+            "221": 302618112.0,
+            "222": 302618112.0,
+            "223": 302618112.0,
+            "224": 302618112.0,
+            "225": 302618112.0,
+            "226": 302618112.0,
+            "227": 302618112.0,
+            "228": 302618112.0,
+            "229": 302618112.0,
+            "230": 302618112.0,
+            "231": 302618112.0,
+            "232": 302618112.0,
+            "233": 302618112.0,
+            "234": 302618112.0,
+            "235": 302618112.0,
+            "236": 302618112.0,
+            "237": 302618112.0,
+            "238": 302618112.0,
+            "239": 302618112.0,
+            "240": 302618112.0,
+            "241": 302618112.0,
+            "242": 302618112.0,
+            "243": 302618112.0,
+            "244": 302618112.0,
+            "245": 302618112.0,
+            "246": 302618112.0,
+            "247": 302618112.0,
+            "248": 302618112.0,
+            "249": 302618112.0,
+            "250": 302618112.0,
+            "251": 302618112.0,
+            "252": 302618112.0,
+            "253": 302618112.0,
+            "254": 302618112.0,
+            "255": 302618112.0,
+            "256": 302618112.0,
+            "257": 302618112.0,
+            "258": 302618112.0,
+            "259": 302618112.0,
+            "260": 302618112.0,
+            "261": 302618112.0,
+            "262": 302618112.0,
+            "263": 302618112.0,
+            "264": 302618112.0,
+            "265": 302618112.0,
+            "266": 302618112.0,
+            "267": 302618112.0,
+            "268": 302618112.0,
+            "269": 302618112.0,
+            "270": 302618112.0,
+            "271": 302618112.0,
+            "272": 302618112.0,
+            "273": 302618112.0,
+            "274": 302618112.0,
+            "275": 302618112.0,
+            "276": 302618112.0,
+            "277": 302618112.0,
+            "278": 302618112.0,
+            "279": 302618112.0,
+            "280": 302618112.0,
+            "281": 302618112.0,
+            "282": 302618112.0,
+            "283": 302618112.0,
+            "284": 302618112.0,
+            "285": 302618112.0,
+            "286": 302618112.0,
+            "287": 302618112.0,
+            "288": 302618112.0,
+            "289": 302618112.0,
+            "290": 302618112.0,
+            "291": 302618112.0,
+            "292": 302618112.0,
+            "293": 302618112.0,
+            "294": 302618112.0,
+            "295": 302618112.0,
+            "296": 302618112.0,
+            "297": 302618112.0,
+            "298": 302618112.0,
+            "299": 302618112.0,
+            "300": 302618112.0,
+            "301": 302618112.0,
+            "302": 302618112.0,
+            "303": 302618112.0,
+            "304": 302618112.0,
+            "305": 302618112.0,
+            "306": 302618112.0,
+            "307": 302618112.0,
+            "308": 302618112.0,
+            "309": 302618112.0,
+            "310": 302618112.0,
+            "311": 302618112.0,
+            "312": 302618112.0,
+            "313": 302618112.0,
+            "314": 302618112.0,
+            "315": 302618112.0,
+            "316": 302618112.0,
+            "317": 302618112.0,
+            "318": 302618112.0,
+            "319": 302618112.0,
+            "320": 302618112.0,
+            "321": 302618112.0,
+            "322": 302618112.0,
+            "323": 302618112.0,
+            "324": 302618112.0,
+            "325": 302618112.0,
+            "326": 302618112.0,
+            "327": 302618112.0,
+            "328": 302618112.0,
+            "329": 302618112.0,
+            "330": 302618112.0,
+            "331": 302618112.0,
+            "332": 302618112.0,
+            "333": 302618112.0,
+            "334": 302618112.0,
+            "335": 302618112.0,
+            "336": 302618112.0,
+            "337": 302618112.0,
+            "338": 302618112.0,
+            "339": 302618112.0,
+            "340": 302618112.0,
+            "341": 302618112.0,
+            "342": 302618112.0,
+            "343": 302618112.0,
+            "344": 302618112.0,
+            "345": 302618112.0,
+            "346": 302618112.0,
+            "347": 302618112.0,
+            "348": 302618112.0,
+            "349": 302618112.0,
+            "350": 302618112.0,
+            "351": 302618112.0,
+            "352": 302618112.0,
+            "353": 302618112.0,
+            "354": 302618112.0,
+            "355": 302618112.0,
+            "356": 302618112.0,
+            "357": 302618112.0,
+            "358": 302618112.0,
+            "359": 302618112.0,
+            "360": 302618112.0,
+            "361": 302618112.0,
+            "362": 302618112.0,
+            "363": 302618112.0,
+            "364": 302618112.0,
+            "365": 302618112.0,
+            "366": 302618112.0,
+            "367": 302618112.0,
+            "368": 302618112.0,
+            "369": 302618112.0,
+            "370": 302618112.0,
+            "371": 302618112.0,
+            "372": 302618112.0,
+            "373": 302618112.0,
+            "374": 302618112.0,
+            "375": 302618112.0,
+            "376": 302618112.0,
+            "377": 302618112.0,
+            "378": 302618112.0,
+            "379": 302618112.0,
+            "380": 302618112.0,
+            "381": 302618112.0,
+            "382": 302618112.0,
+            "383": 302618112.0,
+            "384": 302618112.0,
+            "385": 302618112.0,
+            "386": 302618112.0,
+            "387": 302618112.0,
+            "388": 302618112.0,
+            "389": 302618112.0,
+            "390": 302618112.0,
+            "391": 302618112.0,
+            "392": 302618112.0,
+            "393": 302618112.0,
+            "394": 302618112.0,
+            "395": 302618112.0,
+            "396": 302618112.0,
+            "397": 302618112.0,
+            "398": 302618112.0,
+            "399": 302618112.0,
+            "400": 302618112.0,
+            "401": 302618112.0,
+            "402": 302618112.0,
+            "403": 302618112.0,
+            "404": 302618112.0,
+            "405": 302618112.0,
+            "406": 302618112.0,
+            "407": 302618112.0,
+            "408": 302618112.0,
+            "409": 302618112.0,
+            "410": 302618112.0,
+            "411": 302618112.0,
+            "412": 302618112.0,
+            "413": 302618112.0,
+            "414": 302618112.0,
+            "415": 302618112.0,
+            "416": 302618112.0,
+            "417": 302618112.0,
+            "418": 302618112.0,
+            "419": 302618112.0,
+            "420": 302618112.0,
+            "421": 302618112.0,
+            "422": 302618112.0,
+            "423": 302618112.0,
+            "424": 302618112.0,
+            "425": 302618112.0,
+            "426": 302618112.0,
+            "427": 302618112.0,
+            "428": 302618112.0,
+            "429": 302618112.0,
+            "430": 302618112.0,
+            "431": 302618112.0,
+            "432": 302618112.0,
+            "433": 302618112.0,
+            "434": 302618112.0,
+            "435": 302618112.0,
+            "436": 302618112.0,
+            "437": 302618112.0,
+            "438": 302618112.0,
+            "439": 302618112.0,
+            "440": 302618112.0,
+            "441": 302618112.0,
+            "442": 302618112.0,
+            "443": 302618112.0,
+            "444": 302618112.0,
+            "445": 302618112.0,
+            "446": 302618112.0,
+            "447": 302618112.0,
+            "448": 302618112.0,
+            "449": 302618112.0,
+            "450": 302618112.0,
+            "451": 302618112.0,
+            "452": 302618112.0,
+            "453": 302618112.0,
+            "454": 302618112.0,
+            "455": 302618112.0,
+            "456": 302618112.0,
+            "457": 302618112.0,
+            "458": 302618112.0,
+            "459": 302618112.0,
+            "460": 302618112.0,
+            "461": 302618112.0,
+            "462": 302618112.0,
+            "463": 302618112.0,
+            "464": 302618112.0,
+            "465": 302618112.0,
+            "466": 302618112.0,
+            "467": 302618112.0,
+            "468": 302618112.0,
+            "469": 302618112.0,
+            "470": 302618112.0,
+            "471": 302618112.0,
+            "472": 302618112.0,
+            "473": 302618112.0,
+            "474": 302618112.0,
+            "475": 302618112.0,
+            "476": 302618112.0,
+            "477": 302618112.0,
+            "478": 302618112.0,
+            "479": 302618112.0,
+            "480": 302618112.0,
+            "481": 302618112.0,
+            "482": 302618112.0,
+            "483": 302618112.0,
+            "484": 302618112.0,
+            "485": 302618112.0,
+            "486": 302618112.0,
+            "487": 302618112.0,
+            "488": 302618112.0,
+            "489": 302618112.0,
+            "490": 302618112.0,
+            "491": 302618112.0,
+            "492": 302618112.0,
+            "493": 302618112.0,
+            "494": 302618112.0,
+            "495": 302618112.0,
+            "496": 302618112.0,
+            "497": 302618112.0,
+            "498": 302618112.0,
+            "499": 302618112.0,
+            "500": 302618112.0,
+            "501": 302618112.0,
+            "502": 302618112.0,
+            "503": 302618112.0,
+            "504": 302618112.0,
+            "505": 302618112.0,
+            "506": 302618112.0,
+            "507": 302618112.0,
+            "508": 302618112.0,
+            "509": 302618112.0,
+            "510": 302618112.0,
+            "511": 302618112.0,
+            "512": 302618112.0,
+            "513": 302618112.0,
+            "514": 302618112.0,
+            "515": 302618112.0,
+            "516": 302618112.0,
+            "517": 302618112.0,
+            "518": 302618112.0,
+            "519": 302618112.0,
+            "520": 302618112.0,
+            "521": 302618112.0,
+            "522": 302618112.0,
+            "523": 302618112.0,
+            "524": 302618112.0,
+            "525": 302618112.0,
+            "526": 302618112.0,
+            "527": 302618112.0,
+            "528": 302618112.0,
+            "529": 302618112.0,
+            "530": 302618112.0,
+            "531": 302618112.0,
+            "532": 302618112.0,
+            "533": 302618112.0,
+            "534": 302618112.0,
+            "535": 302618112.0,
+            "536": 302618112.0,
+            "537": 302618112.0,
+            "538": 302618112.0,
+            "539": 302618112.0,
+            "540": 302618112.0,
+            "541": 302618112.0,
+            "542": 302618112.0,
+            "543": 302618112.0,
+            "544": 302618112.0,
+            "545": 302618112.0,
+            "546": 302618112.0,
+            "547": 302618112.0,
+            "548": 302618112.0,
+            "549": 302618112.0,
+            "550": 302618112.0,
+            "551": 302618112.0,
+            "552": 302618112.0,
+            "553": 302618112.0,
+            "554": 302618112.0,
+            "555": 302618112.0,
+            "556": 302618112.0,
+            "557": 302618112.0,
+            "558": 302618112.0,
+            "559": 302618112.0,
+            "560": 302618112.0,
+            "561": 302618112.0,
+            "562": 302618112.0,
+            "563": 302618112.0,
+            "564": 302618112.0,
+            "565": 302618112.0,
+            "566": 302618112.0,
+            "567": 302618112.0,
+            "568": 302618112.0,
+            "569": 302618112.0,
+            "570": 302618112.0,
+            "571": 302618112.0,
+            "572": 302618112.0,
+            "573": 302618112.0,
+            "574": 302618112.0,
+            "575": 302618112.0,
+            "576": 302618112.0,
+            "577": 302618112.0,
+            "578": 302618112.0,
+            "579": 302618112.0,
+            "580": 302618112.0,
+            "581": 302618112.0,
+            "582": 302618112.0,
+            "583": 302618112.0,
+            "584": 302618112.0,
+            "585": 302618112.0,
+            "586": 302618112.0,
+            "587": 302618112.0,
+            "588": 302618112.0,
+            "589": 302618112.0,
+            "590": 302618112.0,
+            "591": 302618112.0,
+            "592": 302618112.0,
+            "593": 302618112.0,
+            "594": 302618112.0,
+            "595": 302618112.0,
+            "596": 302618112.0,
+            "597": 302618112.0,
+            "598": 302618112.0,
+            "599": 302618112.0,
+            "600": 302618112.0,
+            "601": 302618112.0,
+            "602": 302618112.0,
+            "603": 302618112.0,
+            "604": 302618112.0,
+            "605": 302618112.0,
+            "606": 302618112.0,
+            "607": 302618112.0,
+            "608": 302618112.0,
+            "609": 302618112.0,
+            "610": 302618112.0,
+            "611": 302618112.0,
+            "612": 302618112.0,
+            "613": 302618112.0,
+            "614": 302618112.0,
+            "615": 302618112.0,
+            "616": 302618112.0,
+            "617": 302618112.0,
+            "618": 302618112.0,
+            "619": 302618112.0,
+            "620": 302618112.0,
+            "621": 302618112.0,
+            "622": 302618112.0,
+            "623": 302618112.0,
+            "624": 302618112.0,
+            "625": 302618112.0,
+            "626": 302618112.0,
+            "627": 302618112.0,
+            "628": 302618112.0,
+            "629": 302618112.0,
+            "630": 302618112.0,
+            "631": 302618112.0,
+            "632": 302618112.0,
+            "633": 302618112.0,
+            "634": 302618112.0,
+            "635": 302618112.0,
+            "636": 302618112.0,
+            "637": 302618112.0,
+            "638": 302618112.0,
+            "639": 302618112.0,
+            "640": 302618112.0,
+            "641": 302618112.0,
+            "642": 302618112.0,
+            "643": 302618112.0,
+            "644": 302618112.0,
+            "645": 302618112.0,
+            "646": 302618112.0,
+            "647": 302618112.0,
+            "648": 302618112.0,
+            "649": 302618112.0,
+            "650": 302618112.0,
+            "651": 302618112.0,
+            "652": 302618112.0,
+            "653": 302618112.0,
+            "654": 302618112.0,
+            "655": 302618112.0,
+            "656": 302618112.0,
+            "657": 302618112.0,
+            "658": 302618112.0,
+            "659": 302618112.0,
+            "660": 302618112.0,
+            "661": 302618112.0,
+            "662": 302618112.0,
+            "663": 302618112.0,
+            "664": 302618112.0,
+            "665": 302618112.0,
+            "666": 302618112.0,
+            "667": 302618112.0,
+            "668": 302618112.0,
+            "669": 302618112.0,
+            "670": 302618112.0,
+            "671": 302618112.0,
+            "672": 302618112.0,
+            "673": 302618112.0,
+            "674": 302618112.0,
+            "675": 302618112.0,
+            "676": 302618112.0,
+            "677": 302618112.0,
+            "678": 302618112.0,
+            "679": 302618112.0,
+            "680": 302618112.0,
+            "681": 302618112.0,
+            "682": 302618112.0,
+            "683": 302618112.0,
+            "684": 302618112.0,
+            "685": 302618112.0,
+            "686": 302618112.0,
+            "687": 302618112.0,
+            "688": 302618112.0,
+            "689": 302618112.0,
+            "690": 302618112.0,
+            "691": 302618112.0,
+            "692": 302618112.0,
+            "693": 302618112.0,
+            "694": 302618112.0,
+            "695": 302618112.0,
+            "696": 302618112.0,
+            "697": 302618112.0,
+            "698": 302618112.0,
+            "699": 302618112.0,
+            "700": 302618112.0,
+            "701": 302618112.0,
+            "702": 302618112.0,
+            "703": 302618112.0,
+            "704": 302618112.0,
+            "705": 302618112.0,
+            "706": 302618112.0,
+            "707": 302618112.0,
+            "708": 302618112.0,
+            "709": 302618112.0,
+            "710": 302618112.0,
+            "711": 302618112.0,
+            "712": 302618112.0,
+            "713": 302618112.0,
+            "714": 302618112.0,
+            "715": 302618112.0,
+            "716": 302618112.0,
+            "717": 302618112.0,
+            "718": 302618112.0,
+            "719": 302618112.0,
+            "720": 302618112.0,
+            "721": 302618112.0,
+            "722": 302618112.0,
+            "723": 302618112.0,
+            "724": 302618112.0,
+            "725": 302618112.0,
+            "726": 302618112.0,
+            "727": 302618112.0,
+            "728": 302618112.0,
+            "729": 302618112.0,
+            "730": 302618112.0,
+            "731": 302618112.0,
+            "732": 302618112.0,
+            "733": 302618112.0,
+            "734": 302618112.0,
+            "735": 302618112.0,
+            "736": 302618112.0,
+            "737": 302618112.0,
+            "738": 302618112.0,
+            "739": 302618112.0,
+            "740": 302618112.0,
+            "741": 302618112.0,
+            "742": 302618112.0,
+            "743": 302618112.0,
+            "744": 302618112.0,
+            "745": 302618112.0,
+            "746": 302618112.0,
+            "747": 302618112.0,
+            "748": 302618112.0,
+            "749": 302618112.0,
+            "750": 302618112.0,
+            "751": 302618112.0,
+            "752": 302618112.0,
+            "753": 302618112.0,
+            "754": 302618112.0,
+            "755": 302618112.0,
+            "756": 302618112.0,
+            "757": 302618112.0,
+            "758": 302618112.0,
+            "759": 302618112.0,
+            "760": 302618112.0,
+            "761": 302618112.0,
+            "762": 302618112.0,
+            "763": 302618112.0,
+            "764": 302618112.0,
+            "765": 302618112.0,
+            "766": 302618112.0,
+            "767": 302618112.0,
+            "768": 302618112.0,
+            "769": 302618112.0,
+            "770": 302618112.0,
+            "771": 302618112.0,
+            "772": 302618112.0,
+            "773": 302618112.0,
+            "774": 302618112.0,
+            "775": 302618112.0,
+            "776": 302618112.0,
+            "777": 302618112.0,
+            "778": 302618112.0,
+            "779": 302618112.0,
+            "780": 302618112.0,
+            "781": 302618112.0,
+            "782": 302618112.0,
+            "783": 302618112.0,
+            "784": 302618112.0,
+            "785": 302618112.0,
+            "786": 302618112.0,
+            "787": 302618112.0,
+            "788": 302618112.0,
+            "789": 302618112.0,
+            "790": 302618112.0,
+            "791": 302618112.0,
+            "792": 302618112.0,
+            "793": 302618112.0,
+            "794": 302618112.0,
+            "795": 302618112.0,
+            "796": 302618112.0,
+            "797": 302618112.0,
+            "798": 302618112.0,
+            "799": 302618112.0,
+            "800": 302618112.0,
+            "801": 302618112.0,
+            "802": 302618112.0,
+            "803": 302618112.0,
+            "804": 302618112.0,
+            "805": 302618112.0,
+            "806": 302618112.0,
+            "807": 302618112.0,
+            "808": 302618112.0,
+            "809": 302618112.0,
+            "810": 302618112.0,
+            "811": 302618112.0,
+            "812": 302618112.0,
+            "813": 302618112.0,
+            "814": 302618112.0,
+            "815": 302618112.0,
+            "816": 302618112.0,
+            "817": 302618112.0,
+            "818": 302618112.0,
+            "819": 302618112.0,
+            "820": 302618112.0,
+            "821": 302618112.0,
+            "822": 302618112.0,
+            "823": 302618112.0,
+            "824": 302618112.0,
+            "825": 302618112.0,
+            "826": 302618112.0,
+            "827": 302618112.0,
+            "828": 302618112.0,
+            "829": 302618112.0,
+            "830": 302618112.0,
+            "831": 302618112.0,
+            "832": 302618112.0,
+            "833": 302618112.0,
+            "834": 302618112.0,
+            "835": 302618112.0,
+            "836": 302618112.0,
+            "837": 302618112.0,
+            "838": 302618112.0,
+            "839": 302618112.0,
+            "840": 302618112.0,
+            "841": 302618112.0,
+            "842": 302618112.0,
+            "843": 302618112.0,
+            "844": 302618112.0,
+            "845": 302618112.0,
+            "846": 302618112.0,
+            "847": 302618112.0,
+            "848": 302618112.0,
+            "849": 302618112.0,
+            "850": 302618112.0,
+            "851": 302618112.0,
+            "852": 302618112.0,
+            "853": 302618112.0,
+            "854": 302618112.0,
+            "855": 302618112.0,
+            "856": 302618112.0,
+            "857": 302618112.0,
+            "858": 302618112.0,
+            "859": 302618112.0,
+            "860": 302618112.0,
+            "861": 302618112.0,
+            "862": 302618112.0,
+            "863": 302618112.0,
+            "864": 302618112.0,
+            "865": 302618112.0,
+            "866": 302618112.0,
+            "867": 302618112.0,
+            "868": 302618112.0,
+            "869": 302618112.0,
+            "870": 302618112.0,
+            "871": 302618112.0,
+            "872": 302618112.0,
+            "873": 302618112.0,
+            "874": 302618112.0,
+            "875": 302618112.0,
+            "876": 302618112.0,
+            "877": 302618112.0,
+            "878": 302618112.0,
+            "879": 302618112.0,
+            "880": 302618112.0,
+            "881": 302618112.0,
+            "882": 302618112.0,
+            "883": 302618112.0,
+            "884": 302618112.0,
+            "885": 302618112.0,
+            "886": 302618112.0,
+            "887": 302618112.0,
+            "888": 302618112.0,
+            "889": 302618112.0,
+            "890": 302618112.0,
+            "891": 302618112.0,
+            "892": 302618112.0,
+            "893": 302618112.0,
+            "894": 302618112.0,
+            "895": 302618112.0,
+            "896": 302618112.0,
+            "897": 302618112.0,
+            "898": 302618112.0,
+            "899": 302618112.0,
+            "900": 302618112.0,
+            "901": 302618112.0,
+            "902": 302618112.0,
+            "903": 302618112.0,
+            "904": 302618112.0,
+            "905": 302618112.0,
+            "906": 302618112.0,
+            "907": 302618112.0,
+            "908": 302618112.0,
+            "909": 302618112.0,
+            "910": 302618112.0,
+            "911": 302618112.0,
+            "912": 302618112.0,
+            "913": 302618112.0,
+            "914": 302618112.0,
+            "915": 302618112.0,
+            "916": 302618112.0,
+            "917": 302618112.0,
+            "918": 302618112.0,
+            "919": 302618112.0,
+            "920": 302618112.0,
+            "921": 302618112.0,
+            "922": 302618112.0,
+            "923": 302618112.0,
+            "924": 302618112.0,
+            "925": 302618112.0,
+            "926": 302618112.0,
+            "927": 302618112.0,
+            "928": 302618112.0,
+            "929": 302618112.0,
+            "930": 302618112.0,
+            "931": 302618112.0,
+            "932": 302618112.0,
+            "933": 302618112.0,
+            "934": 302618112.0,
+            "935": 302618112.0,
+            "936": 302618112.0,
+            "937": 302618112.0,
+            "938": 302618112.0,
+            "939": 302618112.0,
+            "940": 302618112.0,
+            "941": 302618112.0,
+            "942": 302618112.0,
+            "943": 302618112.0,
+            "944": 302618112.0,
+            "945": 302618112.0,
+            "946": 302618112.0,
+            "947": 302618112.0,
+            "948": 302618112.0,
+            "949": 302618112.0,
+            "950": 302618112.0,
+            "951": 302618112.0,
+            "952": 302618112.0,
+            "953": 302618112.0,
+            "954": 302618112.0,
+            "955": 302618112.0,
+            "956": 302618112.0,
+            "957": 302618112.0,
+            "958": 302618112.0,
+            "959": 302618112.0,
+            "960": 302618112.0,
+            "961": 302618112.0,
+            "962": 302618112.0,
+            "963": 302618112.0,
+            "964": 302618112.0,
+            "965": 302618112.0,
+            "966": 302618112.0,
+            "967": 302618112.0,
+            "968": 302618112.0,
+            "969": 302618112.0,
+            "970": 302618112.0,
+            "971": 302618112.0,
+            "972": 302618112.0,
+            "973": 302618112.0,
+            "974": 302618112.0,
+            "975": 302618112.0,
+            "976": 302618112.0,
+            "977": 302618112.0,
+            "978": 302618112.0,
+            "979": 302618112.0,
+            "980": 302618112.0,
+            "981": 302618112.0,
+            "982": 302618112.0,
+            "983": 302618112.0,
+            "984": 302618112.0,
+            "985": 302618112.0,
+            "986": 302618112.0,
+            "987": 302618112.0,
+            "988": 302618112.0,
+            "989": 302618112.0,
+            "990": 302618112.0,
+            "991": 302618112.0,
+            "992": 302618112.0,
+            "993": 302618112.0,
+            "994": 302618112.0,
+            "995": 302618112.0,
+            "996": 302618112.0,
+            "997": 302618112.0,
+            "998": 302618112.0,
+            "999": 302618112.0,
+            "1000": 302618112.0,
+            "1001": 302618112.0,
+            "1002": 302618112.0,
+            "1003": 302618112.0,
+            "1004": 302618112.0,
+            "1005": 302618112.0,
+            "1006": 302618112.0,
+            "1007": 302618112.0,
+            "1008": 302618112.0,
+            "1009": 302618112.0,
+            "1010": 302618112.0,
+            "1011": 302618112.0,
+            "1012": 302618112.0,
+            "1013": 302618112.0,
+            "1014": 302618112.0,
+            "1015": 302618112.0,
+            "1016": 302618112.0,
+            "1017": 302618112.0,
+            "1018": 302618112.0,
+            "1019": 302618112.0,
+            "1020": 302618112.0,
+            "1021": 302618112.0,
+            "1022": 302618112.0,
+            "1023": 302618112.0,
+            "1024": 302618112.0,
+            "1025": 302618112.0,
+            "1026": 302618112.0,
+            "1027": 302618112.0,
+            "1028": 302618112.0,
+            "1029": 302618112.0,
+            "1030": 302618112.0,
+            "1031": 302618112.0,
+            "1032": 302618112.0,
+            "1033": 302618112.0,
+            "1034": 302618112.0,
+            "1035": 302618112.0,
+            "1036": 302618112.0,
+            "1037": 302618112.0,
+            "1038": 302618112.0,
+            "1039": 302618112.0,
+            "1040": 302618112.0,
+            "1041": 302618112.0,
+            "1042": 302618112.0,
+            "1043": 302618112.0,
+            "1044": 302618112.0,
+            "1045": 302618112.0,
+            "1046": 302618112.0,
+            "1047": 302618112.0,
+            "1048": 302618112.0,
+            "1049": 302618112.0,
+            "1050": 302618112.0,
+            "1051": 302618112.0,
+            "1052": 302618112.0,
+            "1053": 302618112.0,
+            "1054": 302618112.0,
+            "1055": 302618112.0,
+            "1056": 302618112.0,
+            "1057": 302618112.0,
+            "1058": 302618112.0,
+            "1059": 302618112.0,
+            "1060": 302618112.0,
+            "1061": 302618112.0,
+            "1062": 302618112.0,
+            "1063": 302618112.0,
+            "1064": 302618112.0,
+            "1065": 302618112.0,
+            "1066": 302618112.0,
+            "1067": 302618112.0,
+            "1068": 302618112.0,
+            "1069": 302618112.0,
+            "1070": 302618112.0,
+            "1071": 302618112.0,
+            "1072": 302618112.0,
+            "1073": 302618112.0,
+            "1074": 302618112.0,
+            "1075": 302618112.0,
+            "1076": 302618112.0,
+            "1077": 302618112.0,
+            "1078": 302618112.0,
+            "1079": 302618112.0,
+            "1080": 302618112.0,
+            "1081": 302618112.0,
+            "1082": 302618112.0,
+            "1083": 302618112.0,
+            "1084": 302618112.0,
+            "1085": 302618112.0,
+            "1086": 302618112.0,
+            "1087": 302618112.0,
+            "1088": 302618112.0,
+            "1089": 302618112.0,
+            "1090": 302618112.0,
+            "1091": 302618112.0,
+            "1092": 302618112.0,
+            "1093": 302618112.0,
+            "1094": 302618112.0,
+            "1095": 302618112.0,
+            "1096": 302618112.0,
+            "1097": 302618112.0,
+            "1098": 302618112.0,
+            "1099": 302618112.0,
+            "1100": 302618112.0,
+            "1101": 302618112.0,
+            "1102": 302618112.0,
+            "1103": 302618112.0,
+            "1104": 302618112.0,
+            "1105": 302618112.0,
+            "1106": 302618112.0,
+            "1107": 302618112.0,
+            "1108": 302618112.0,
+            "1109": 302618112.0,
+            "1110": 302618112.0,
+            "1111": 302618112.0,
+            "1112": 302618112.0,
+            "1113": 302618112.0,
+            "1114": 302618112.0,
+            "1115": 302618112.0,
+            "1116": 302618112.0,
+            "1117": 302618112.0,
+            "1118": 302618112.0,
+            "1119": 302618112.0,
+            "1120": 302618112.0,
+            "1121": 302618112.0,
+            "1122": 302618112.0,
+            "1123": 302618112.0,
+            "1124": 302618112.0,
+            "1125": 302618112.0,
+            "1126": 302618112.0,
+            "1127": 302618112.0,
+            "1128": 302618112.0,
+            "1129": 302618112.0,
+            "1130": 302618112.0,
+            "1131": 302618112.0,
+            "1132": 302618112.0,
+            "1133": 302618112.0,
+            "1134": 302618112.0,
+            "1135": 302618112.0,
+            "1136": 302618112.0,
+            "1137": 302618112.0,
+            "1138": 302618112.0,
+            "1139": 302618112.0,
+            "1140": 302618112.0,
+            "1141": 302618112.0,
+            "1142": 302618112.0,
+            "1143": 302618112.0,
+            "1144": 302618112.0,
+            "1145": 302618112.0,
+            "1146": 302618112.0,
+            "1147": 302618112.0,
+            "1148": 302618112.0,
+            "1149": 302618112.0,
+            "1150": 302618112.0,
+            "1151": 302618112.0,
+            "1152": 302618112.0,
+            "1153": 302618112.0,
+            "1154": 302618112.0,
+            "1155": 302618112.0,
+            "1156": 302618112.0,
+            "1157": 302618112.0,
+            "1158": 302618112.0,
+            "1159": 302618112.0,
+            "1160": 302618112.0,
+            "1161": 302618112.0,
+            "1162": 302618112.0,
+            "1163": 302618112.0,
+            "1164": 302618112.0,
+            "1165": 302618112.0,
+            "1166": 302618112.0,
+            "1167": 302618112.0,
+            "1168": 302618112.0,
+            "1169": 302618112.0,
+            "1170": 302618112.0,
+            "1171": 302618112.0,
+            "1172": 302618112.0,
+            "1173": 302618112.0,
+            "1174": 302618112.0,
+            "1175": 302618112.0,
+            "1176": 302618112.0,
+            "1177": 302618112.0,
+            "1178": 302618112.0,
+            "1179": 302618112.0,
+            "1180": 302618112.0,
+            "1181": 302618112.0,
+            "1182": 302618112.0,
+            "1183": 302618112.0,
+            "1184": 302618112.0,
+            "1185": 302618112.0,
+            "1186": 302618112.0,
+            "1187": 302618112.0,
+            "1188": 302618112.0,
+            "1189": 302618112.0,
+            "1190": 302618112.0,
+            "1191": 302618112.0,
+            "1192": 302618112.0,
+            "1193": 302618112.0,
+            "1194": 302618112.0,
+            "1195": 302618112.0,
+            "1196": 302618112.0,
+            "1197": 302618112.0,
+            "1198": 302618112.0,
+            "1199": 302618112.0,
+            "1200": 302618112.0,
+            "1201": 302618112.0,
+            "1202": 302618112.0,
+            "1203": 302618112.0,
+            "1204": 302618112.0,
+            "1205": 302618112.0,
+            "1206": 302618112.0,
+            "1207": 302618112.0,
+            "1208": 302618112.0,
+            "1209": 302618112.0,
+            "1210": 302618112.0,
+            "1211": 302618112.0,
+            "1212": 302618112.0,
+            "1213": 302618112.0,
+            "1214": 302618112.0,
+            "1215": 302618112.0,
+            "1216": 302618112.0,
+            "1217": 302618112.0,
+            "1218": 302618112.0,
+            "1219": 302618112.0,
+            "1220": 302618112.0,
+            "1221": 302618112.0,
+            "1222": 302618112.0,
+            "1223": 302618112.0,
+            "1224": 302618112.0,
+            "1225": 302618112.0,
+            "1226": 302618112.0,
+            "1227": 302618112.0,
+            "1228": 302618112.0,
+            "1229": 302618112.0,
+            "1230": 302618112.0,
+            "1231": 302618112.0,
+            "1232": 302618112.0,
+            "1233": 302618112.0,
+            "1234": 302618112.0,
+            "1235": 302618112.0,
+            "1236": 302618112.0,
+            "1237": 302618112.0,
+            "1238": 302618112.0,
+            "1239": 302618112.0,
+            "1240": 302618112.0,
+            "1241": 302618112.0,
+            "1242": 302618112.0,
+            "1243": 302618112.0,
+            "1244": 302618112.0,
+            "1245": 302618112.0,
+            "1246": 302618112.0,
+            "1247": 302618112.0,
+            "1248": 302618112.0,
+            "1249": 302618112.0,
+            "1250": 302618112.0,
+            "1251": 302618112.0,
+            "1252": 302618112.0,
+            "1253": 302618112.0,
+            "1254": 302618112.0,
+            "1255": 302618112.0,
+            "1256": 302618112.0,
+            "1257": 302618112.0,
+            "1258": 302618112.0,
+            "1259": 302618112.0,
+            "1260": 302618112.0,
+            "1261": 302618112.0,
+            "1262": 302618112.0,
+            "1263": 302618112.0,
+            "1264": 302618112.0,
+            "1265": 302618112.0,
+            "1266": 302618112.0,
+            "1267": 302618112.0,
+            "1268": 302618112.0,
+            "1269": 302618112.0,
+            "1270": 302618112.0,
+            "1271": 302618112.0,
+            "1272": 302618112.0,
+            "1273": 302618112.0,
+            "1274": 302618112.0,
+            "1275": 302618112.0,
+            "1276": 302618112.0,
+            "1277": 302618112.0,
+            "1278": 302618112.0,
+            "1279": 302618112.0,
+            "1280": 302618112.0,
+            "1281": 302618112.0,
+            "1282": 302618112.0,
+            "1283": 302618112.0,
+            "1284": 302618112.0,
+            "1285": 302618112.0,
+            "1286": 302618112.0,
+            "1287": 302618112.0,
+            "1288": 302618112.0,
+            "1289": 302618112.0,
+            "1290": 302618112.0,
+            "1291": 302618112.0,
+            "1292": 302618112.0,
+            "1293": 302618112.0,
+            "1294": 302618112.0,
+            "1295": 302618112.0,
+            "1296": 302618112.0,
+            "1297": 302618112.0,
+            "1298": 302618112.0,
+            "1299": 302618112.0,
+            "1300": 302618112.0,
+            "1301": 302618112.0,
+            "1302": 302618112.0,
+            "1303": 302618112.0,
+            "1304": 302618112.0,
+            "1305": 302618112.0,
+            "1306": 302618112.0,
+            "1307": 302618112.0,
+            "1308": 302618112.0,
+            "1309": 302618112.0,
+            "1310": 302618112.0,
+            "1311": 302618112.0,
+            "1312": 302618112.0,
+            "1313": 302618112.0,
+            "1314": 302618112.0,
+            "1315": 302618112.0,
+            "1316": 302618112.0,
+            "1317": 302618112.0,
+            "1318": 302618112.0,
+            "1319": 302618112.0,
+            "1320": 302618112.0,
+            "1321": 302618112.0,
+            "1322": 302618112.0,
+            "1323": 302618112.0,
+            "1324": 302618112.0,
+            "1325": 302618112.0,
+            "1326": 302618112.0,
+            "1327": 302618112.0,
+            "1328": 302618112.0,
+            "1329": 302618112.0,
+            "1330": 302618112.0,
+            "1331": 302618112.0,
+            "1332": 302618112.0,
+            "1333": 302618112.0,
+            "1334": 302618112.0,
+            "1335": 302618112.0,
+            "1336": 302618112.0,
+            "1337": 302618112.0,
+            "1338": 302618112.0,
+            "1339": 302618112.0,
+            "1340": 302618112.0,
+            "1341": 302618112.0,
+            "1342": 302618112.0,
+            "1343": 302618112.0,
+            "1344": 302618112.0,
+            "1345": 302618112.0,
+            "1346": 302618112.0,
+            "1347": 302618112.0,
+            "1348": 302618112.0,
+            "1349": 302618112.0,
+            "1350": 302618112.0,
+            "1351": 302618112.0,
+            "1352": 302618112.0,
+            "1353": 302618112.0,
+            "1354": 302618112.0,
+            "1355": 302618112.0,
+            "1356": 302618112.0,
+            "1357": 302618112.0,
+            "1358": 302618112.0,
+            "1359": 302618112.0,
+            "1360": 302618112.0,
+            "1361": 302618112.0,
+            "1362": 302618112.0,
+            "1363": 302618112.0,
+            "1364": 302618112.0,
+            "1365": 302618112.0,
+            "1366": 302618112.0,
+            "1367": 302618112.0,
+            "1368": 302618112.0,
+            "1369": 302618112.0,
+            "1370": 302618112.0,
+            "1371": 302618112.0,
+            "1372": 302618112.0,
+            "1373": 302618112.0,
+            "1374": 302618112.0,
+            "1375": 302618112.0,
+            "1376": 302618112.0,
+            "1377": 302618112.0,
+            "1378": 302618112.0,
+            "1379": 302618112.0,
+            "1380": 302618112.0,
+            "1381": 302618112.0,
+            "1382": 302618112.0,
+            "1383": 302618112.0,
+            "1384": 302618112.0,
+            "1385": 302618112.0,
+            "1386": 302618112.0,
+            "1387": 302618112.0,
+            "1388": 302618112.0,
+            "1389": 302618112.0,
+            "1390": 302618112.0,
+            "1391": 302618112.0,
+            "1392": 302618112.0,
+            "1393": 302618112.0,
+            "1394": 302618112.0,
+            "1395": 302618112.0,
+            "1396": 302618112.0,
+            "1397": 302618112.0,
+            "1398": 302618112.0,
+            "1399": 302618112.0,
+            "1400": 302618112.0,
+            "1401": 302618112.0,
+            "1402": 302618112.0,
+            "1403": 302618112.0,
+            "1404": 302618112.0,
+            "1405": 302618112.0,
+            "1406": 302618112.0,
+            "1407": 302618112.0,
+            "1408": 302618112.0,
+            "1409": 302618112.0,
+            "1410": 302618112.0,
+            "1411": 302618112.0,
+            "1412": 302618112.0,
+            "1413": 302618112.0,
+            "1414": 302618112.0,
+            "1415": 302618112.0,
+            "1416": 302618112.0,
+            "1417": 302618112.0,
+            "1418": 302618112.0,
+            "1419": 302618112.0,
+            "1420": 302618112.0,
+            "1421": 302618112.0,
+            "1422": 302618112.0,
+            "1423": 302618112.0,
+            "1424": 302618112.0,
+            "1425": 302618112.0,
+            "1426": 302618112.0,
+            "1427": 302618112.0,
+            "1428": 302618112.0,
+            "1429": 302618112.0,
+            "1430": 302618112.0,
+            "1431": 302618112.0,
+            "1432": 302618112.0,
+            "1433": 302618112.0,
+            "1434": 302618112.0,
+            "1435": 302618112.0,
+            "1436": 302618112.0,
+            "1437": 302618112.0,
+            "1438": 302618112.0,
+            "1439": 302618112.0,
+            "1440": 302618112.0,
+            "1441": 302618112.0,
+            "1442": 302618112.0,
+            "1443": 302618112.0,
+            "1444": 302618112.0,
+            "1445": 302618112.0,
+            "1446": 302618112.0,
+            "1447": 302618112.0,
+            "1448": 302618112.0,
+            "1449": 302618112.0,
+            "1450": 302618112.0,
+            "1451": 302618112.0,
+            "1452": 302618112.0,
+            "1453": 302618112.0,
+            "1454": 302618112.0,
+            "1455": 302618112.0,
+            "1456": 302618112.0,
+            "1457": 302618112.0,
+            "1458": 302618112.0,
+            "1459": 302618112.0,
+            "1460": 302618112.0,
+            "1461": 302618112.0,
+            "1462": 302618112.0,
+            "1463": 302618112.0,
+            "1464": 302618112.0,
+            "1465": 302618112.0,
+            "1466": 302618112.0,
+            "1467": 302618112.0,
+            "1468": 302618112.0,
+            "1469": 302618112.0,
+            "1470": 302618112.0,
+            "1471": 302618112.0,
+            "1472": 302618112.0,
+            "1473": 302618112.0,
+            "1474": 302618112.0,
+            "1475": 302618112.0,
+            "1476": 302618112.0,
+            "1477": 302618112.0,
+            "1478": 302618112.0,
+            "1479": 302618112.0,
+            "1480": 302618112.0,
+            "1481": 302618112.0,
+            "1482": 302618112.0,
+            "1483": 302618112.0,
+            "1484": 302618112.0,
+            "1485": 302618112.0,
+            "1486": 302618112.0,
+            "1487": 302618112.0,
+            "1488": 302618112.0,
+            "1489": 302618112.0,
+            "1490": 302618112.0,
+            "1491": 302618112.0,
+            "1492": 302618112.0,
+            "1493": 302618112.0,
+            "1494": 302618112.0,
+            "1495": 302618112.0,
+            "1496": 302618112.0,
+            "1497": 302618112.0,
+            "1498": 302618112.0,
+            "1499": 302618112.0,
+            "1500": 302618112.0,
+            "1501": 302618112.0,
+            "1502": 302618112.0,
+            "1503": 302618112.0,
+            "1504": 302618112.0,
+            "1505": 302618112.0,
+            "1506": 302618112.0,
+            "1507": 302618112.0,
+            "1508": 302618112.0,
+            "1509": 302618112.0,
+            "1510": 302618112.0,
+            "1511": 302618112.0,
+            "1512": 302618112.0,
+            "1513": 302618112.0,
+            "1514": 302618112.0,
+            "1515": 302618112.0,
+            "1516": 302618112.0,
+            "1517": 302618112.0,
+            "1518": 302618112.0,
+            "1519": 302618112.0,
+            "1520": 302618112.0,
+            "1521": 302618112.0,
+            "1522": 302618112.0,
+            "1523": 302618112.0,
+            "1524": 302618112.0,
+            "1525": 302618112.0,
+            "1526": 302618112.0,
+            "1527": 302618112.0,
+            "1528": 302618112.0,
+            "1529": 302618112.0,
+            "1530": 302618112.0,
+            "1531": 302618112.0,
+            "1532": 302618112.0,
+            "1533": 302618112.0,
+            "1534": 302618112.0,
+            "1535": 302618112.0,
+            "1536": 302618112.0,
+            "1537": 302618112.0,
+            "1538": 302618112.0,
+            "1539": 302618112.0,
+            "1540": 302618112.0,
+            "1541": 302618112.0,
+            "1542": 302618112.0,
+            "1543": 302618112.0,
+            "1544": 302618112.0,
+            "1545": 302618112.0,
+            "1546": 302618112.0,
+            "1547": 302618112.0,
+            "1548": 302618112.0,
+            "1549": 302618112.0,
+            "1550": 302618112.0,
+            "1551": 302618112.0,
+            "1552": 302618112.0,
+            "1553": 302618112.0,
+            "1554": 302618112.0,
+            "1555": 302618112.0,
+            "1556": 302618112.0,
+            "1557": 302618112.0,
+            "1558": 302618112.0,
+            "1559": 302618112.0,
+            "1560": 302618112.0,
+            "1561": 302618112.0,
+            "1562": 302618112.0,
+            "1563": 302618112.0,
+            "1564": 302618112.0,
+            "1565": 302618112.0,
+            "1566": 302618112.0,
+            "1567": 302618112.0,
+            "1568": 302618112.0,
+            "1569": 302618112.0,
+            "1570": 302618112.0,
+            "1571": 302618112.0,
+            "1572": 302618112.0,
+            "1573": 302618112.0,
+            "1574": 302618112.0,
+            "1575": 302618112.0,
+            "1576": 302618112.0,
+            "1577": 302618112.0,
+            "1578": 302618112.0,
+            "1579": 302618112.0,
+            "1580": 302618112.0,
+            "1581": 302618112.0,
+            "1582": 302618112.0,
+            "1583": 302618112.0,
+            "1584": 302618112.0,
+            "1585": 302618112.0,
+            "1586": 302618112.0,
+            "1587": 302618112.0,
+            "1588": 302618112.0,
+            "1589": 302618112.0,
+            "1590": 302618112.0,
+            "1591": 302618112.0,
+            "1592": 302618112.0,
+            "1593": 302618112.0,
+            "1594": 302618112.0,
+            "1595": 302618112.0,
+            "1596": 302618112.0,
+            "1597": 302618112.0,
+            "1598": 302618112.0,
+            "1599": 302618112.0,
+            "1600": 302618112.0,
+            "1601": 302618112.0,
+            "1602": 302618112.0,
+            "1603": 302618112.0,
+            "1604": 302618112.0,
+            "1605": 302618112.0,
+            "1606": 302618112.0,
+            "1607": 302618112.0,
+            "1608": 302618112.0,
+            "1609": 302618112.0,
+            "1610": 302618112.0,
+            "1611": 302618112.0,
+            "1612": 302618112.0,
+            "1613": 302618112.0,
+            "1614": 302618112.0,
+            "1615": 302618112.0,
+            "1616": 302618112.0,
+            "1617": 302618112.0,
+            "1618": 302618112.0,
+            "1619": 302618112.0,
+            "1620": 302618112.0,
+            "1621": 302618112.0,
+            "1622": 302618112.0,
+            "1623": 302618112.0,
+            "1624": 302618112.0,
+            "1625": 302618112.0,
+            "1626": 302618112.0,
+            "1627": 302618112.0,
+            "1628": 302618112.0,
+            "1629": 302618112.0,
+            "1630": 302618112.0,
+            "1631": 302618112.0,
+            "1632": 302618112.0,
+            "1633": 302618112.0,
+            "1634": 302618112.0,
+            "1635": 302618112.0,
+            "1636": 302618112.0,
+            "1637": 302618112.0,
+            "1638": 302618112.0,
+            "1639": 302618112.0,
+            "1640": 302618112.0,
+            "1641": 302618112.0,
+            "1642": 302618112.0,
+            "1643": 302618112.0,
+            "1644": 302618112.0,
+            "1645": 302618112.0,
+            "1646": 302618112.0,
+            "1647": 302618112.0,
+            "1648": 302618112.0,
+            "1649": 302618112.0,
+            "1650": 302618112.0,
+            "1651": 302618112.0,
+            "1652": 302618112.0,
+            "1653": 302618112.0,
+            "1654": 302618112.0,
+            "1655": 302618112.0,
+            "1656": 302618112.0,
+            "1657": 302618112.0,
+            "1658": 302618112.0,
+            "1659": 302618112.0,
+            "1660": 302618112.0,
+            "1661": 302618112.0,
+            "1662": 302618112.0,
+            "1663": 302618112.0,
+            "1664": 302618112.0,
+            "1665": 302618112.0,
+            "1666": 302618112.0,
+            "1667": 302618112.0,
+            "1668": 302618112.0,
+            "1669": 302618112.0,
+            "1670": 302618112.0,
+            "1671": 302618112.0,
+            "1672": 302618112.0,
+            "1673": 302618112.0,
+            "1674": 302618112.0,
+            "1675": 302618112.0,
+            "1676": 302618112.0,
+            "1677": 302618112.0,
+            "1678": 302618112.0,
+            "1679": 302618112.0,
+            "1680": 302618112.0,
+            "1681": 302618112.0,
+            "1682": 302618112.0,
+            "1683": 302618112.0,
+            "1684": 302618112.0,
+            "1685": 302618112.0,
+            "1686": 302618112.0,
+            "1687": 302618112.0,
+            "1688": 302618112.0,
+            "1689": 302618112.0,
+            "1690": 302618112.0,
+            "1691": 302618112.0,
+            "1692": 302618112.0,
+            "1693": 302618112.0,
+            "1694": 302618112.0,
+            "1695": 302618112.0,
+            "1696": 302618112.0,
+            "1697": 302618112.0,
+            "1698": 302618112.0,
+            "1699": 302618112.0,
+            "1700": 302618112.0,
+            "1701": 302618112.0,
+            "1702": 302618112.0,
+            "1703": 302618112.0,
+            "1704": 302618112.0,
+            "1705": 302618112.0,
+            "1706": 302618112.0,
+            "1707": 302618112.0,
+            "1708": 302618112.0,
+            "1709": 302618112.0,
+            "1710": 302618112.0,
+            "1711": 302618112.0,
+            "1712": 302618112.0,
+            "1713": 302618112.0,
+            "1714": 302618112.0,
+            "1715": 302618112.0,
+            "1716": 302618112.0,
+            "1717": 302618112.0,
+            "1718": 302618112.0,
+            "1719": 302618112.0,
+            "1720": 302618112.0,
+            "1721": 302618112.0,
+            "1722": 302618112.0,
+            "1723": 302618112.0,
+            "1724": 302618112.0,
+            "1725": 302618112.0,
+            "1726": 302618112.0,
+            "1727": 302618112.0,
+            "1728": 302618112.0,
+            "1729": 302618112.0,
+            "1730": 302618112.0,
+            "1731": 302618112.0,
+            "1732": 302618112.0,
+            "1733": 302618112.0,
+            "1734": 302618112.0,
+            "1735": 302618112.0,
+            "1736": 302618112.0,
+            "1737": 302618112.0,
+            "1738": 302618112.0,
+            "1739": 302618112.0,
+            "1740": 302618112.0,
+            "1741": 302618112.0,
+            "1742": 302618112.0,
+            "1743": 302618112.0,
+            "1744": 302618112.0,
+            "1745": 302618112.0,
+            "1746": 302618112.0,
+            "1747": 302618112.0,
+            "1748": 302618112.0,
+            "1749": 302618112.0,
+            "1750": 302618112.0,
+            "1751": 302618112.0,
+            "1752": 302618112.0,
+            "1753": 302618112.0,
+            "1754": 302618112.0,
+            "1755": 302618112.0,
+            "1756": 302618112.0,
+            "1757": 302618112.0,
+            "1758": 302618112.0,
+            "1759": 302618112.0,
+            "1760": 302618112.0,
+            "1761": 302618112.0,
+            "1762": 302618112.0,
+            "1763": 302618112.0,
+            "1764": 302618112.0,
+            "1765": 302618112.0,
+            "1766": 302618112.0,
+            "1767": 302618112.0,
+            "1768": 302618112.0,
+            "1769": 302618112.0,
+            "1770": 302618112.0,
+            "1771": 302618112.0,
+            "1772": 302618112.0,
+            "1773": 302618112.0,
+            "1774": 302618112.0,
+            "1775": 302618112.0,
+            "1776": 302618112.0,
+            "1777": 302618112.0,
+            "1778": 302618112.0,
+            "1779": 302618112.0,
+            "1780": 302618112.0,
+            "1781": 302618112.0,
+            "1782": 302618112.0,
+            "1783": 302618112.0,
+            "1784": 302618112.0,
+            "1785": 302618112.0,
+            "1786": 302618112.0,
+            "1787": 302618112.0,
+            "1788": 302618112.0,
+            "1789": 302618112.0,
+            "1790": 302618112.0,
+            "1791": 302618112.0,
+            "1792": 302618112.0,
+            "1793": 302618112.0,
+            "1794": 302618112.0,
+            "1795": 302618112.0,
+            "1796": 302618112.0,
+            "1797": 302618112.0,
+            "1798": 302618112.0,
+            "1799": 302618112.0,
+            "1800": 302618112.0,
+            "1801": 302618112.0,
+            "1802": 302618112.0,
+            "1803": 302618112.0,
+            "1804": 302618112.0,
+            "1805": 302618112.0,
+            "1806": 302618112.0,
+            "1807": 302618112.0,
+            "1808": 302618112.0,
+            "1809": 302618112.0,
+            "1810": 302618112.0,
+            "1811": 302618112.0,
+            "1812": 302618112.0,
+            "1813": 302618112.0,
+            "1814": 302618112.0,
+            "1815": 302618112.0,
+            "1816": 302618112.0,
+            "1817": 302618112.0,
+            "1818": 302618112.0,
+            "1819": 302618112.0,
+            "1820": 302618112.0,
+            "1821": 302618112.0,
+            "1822": 302618112.0,
+            "1823": 302618112.0,
+            "1824": 302618112.0,
+            "1825": 302618112.0,
+            "1826": 302618112.0,
+            "1827": 302618112.0,
+            "1828": 302618112.0,
+            "1829": 302618112.0,
+            "1830": 302618112.0,
+            "1831": 302618112.0,
+            "1832": 302618112.0,
+            "1833": 302618112.0,
+            "1834": 302618112.0,
+            "1835": 302618112.0,
+            "1836": 302618112.0,
+            "1837": 302618112.0,
+            "1838": 302618112.0,
+            "1839": 302618112.0,
+            "1840": 302618112.0,
+            "1841": 302618112.0,
+            "1842": 302618112.0,
+            "1843": 302618112.0,
+            "1844": 302618112.0,
+            "1845": 302618112.0,
+            "1846": 302618112.0,
+            "1847": 302618112.0,
+            "1848": 302618112.0,
+            "1849": 302618112.0,
+            "1850": 302618112.0,
+            "1851": 302618112.0,
+            "1852": 302618112.0,
+            "1853": 302618112.0,
+            "1854": 302618112.0,
+            "1855": 302618112.0,
+            "1856": 302618112.0,
+            "1857": 302618112.0,
+            "1858": 302618112.0,
+            "1859": 302618112.0,
+            "1860": 302618112.0,
+            "1861": 302618112.0,
+            "1862": 302618112.0,
+            "1863": 302618112.0,
+            "1864": 302618112.0,
+            "1865": 302618112.0,
+            "1866": 302618112.0,
+            "1867": 302618112.0,
+            "1868": 302618112.0,
+            "1869": 302618112.0,
+            "1870": 302618112.0,
+            "1871": 302618112.0,
+            "1872": 302618112.0,
+            "1873": 302618112.0,
+            "1874": 302618112.0,
+            "1875": 302618112.0,
+            "1876": 302618112.0,
+            "1877": 302618112.0,
+            "1878": 302618112.0,
+            "1879": 302618112.0,
+            "1880": 302618112.0,
+            "1881": 302618112.0,
+            "1882": 302618112.0,
+            "1883": 302618112.0,
+            "1884": 302618112.0,
+            "1885": 302618112.0,
+            "1886": 302618112.0,
+            "1887": 302618112.0,
+            "1888": 302618112.0,
+            "1889": 302618112.0,
+            "1890": 302618112.0,
+            "1891": 302618112.0,
+            "1892": 302618112.0,
+            "1893": 302618112.0,
+            "1894": 302618112.0,
+            "1895": 302618112.0,
+            "1896": 302618112.0,
+            "1897": 302618112.0,
+            "1898": 302618112.0,
+            "1899": 302618112.0,
+            "1900": 302618112.0,
+            "1901": 302618112.0,
+            "1902": 302618112.0,
+            "1903": 302618112.0,
+            "1904": 302618112.0,
+            "1905": 302618112.0,
+            "1906": 302618112.0,
+            "1907": 302618112.0,
+            "1908": 302618112.0,
+            "1909": 302618112.0,
+            "1910": 302618112.0,
+            "1911": 302618112.0,
+            "1912": 302618112.0,
+            "1913": 302618112.0,
+            "1914": 302618112.0,
+            "1915": 302618112.0,
+            "1916": 302618112.0,
+            "1917": 302618112.0,
+            "1918": 302618112.0,
+            "1919": 302618112.0,
+            "1920": 302618112.0,
+            "1921": 302618112.0,
+            "1922": 302618112.0,
+            "1923": 302618112.0,
+            "1924": 302618112.0,
+            "1925": 302618112.0,
+            "1926": 302618112.0,
+            "1927": 302618112.0,
+            "1928": 302618112.0,
+            "1929": 302618112.0,
+            "1930": 302618112.0,
+            "1931": 302618112.0,
+            "1932": 302618112.0,
+            "1933": 302618112.0,
+            "1934": 302618112.0,
+            "1935": 302618112.0,
+            "1936": 302618112.0,
+            "1937": 302618112.0,
+            "1938": 302618112.0,
+            "1939": 302618112.0,
+            "1940": 302618112.0,
+            "1941": 302618112.0,
+            "1942": 302618112.0,
+            "1943": 302618112.0,
+            "1944": 302618112.0,
+            "1945": 302618112.0,
+            "1946": 302618112.0,
+            "1947": 302618112.0,
+            "1948": 302618112.0,
+            "1949": 302618112.0,
+            "1950": 302618112.0,
+            "1951": 302618112.0,
+            "1952": 302618112.0,
+            "1953": 302618112.0,
+            "1954": 302618112.0,
+            "1955": 302618112.0,
+            "1956": 302618112.0,
+            "1957": 302618112.0,
+            "1958": 302618112.0,
+            "1959": 302618112.0,
+            "1960": 302618112.0,
+            "1961": 302618112.0,
+            "1962": 302618112.0,
+            "1963": 302618112.0,
+            "1964": 302618112.0,
+            "1965": 302618112.0,
+            "1966": 302618112.0,
+            "1967": 302618112.0,
+            "1968": 302618112.0,
+            "1969": 302618112.0,
+            "1970": 302618112.0,
+            "1971": 302618112.0,
+            "1972": 302618112.0,
+            "1973": 302618112.0,
+            "1974": 302618112.0,
+            "1975": 302618112.0,
+            "1976": 302618112.0,
+            "1977": 302618112.0,
+            "1978": 302618112.0,
+            "1979": 302618112.0,
+            "1980": 302618112.0,
+            "1981": 302618112.0,
+            "1982": 302618112.0,
+            "1983": 302618112.0,
+            "1984": 302618112.0,
+            "1985": 302618112.0,
+            "1986": 302618112.0,
+            "1987": 302618112.0,
+            "1988": 302618112.0,
+            "1989": 302618112.0,
+            "1990": 302618112.0,
+            "1991": 302618112.0,
+            "1992": 302618112.0,
+            "1993": 302618112.0,
+            "1994": 302618112.0,
+            "1995": 302618112.0,
+            "1996": 302618112.0,
+            "1997": 302618112.0,
+            "1998": 302618112.0,
+            "1999": 302618112.0,
+            "2000": 302618112.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 362060288.0,
+            "2": 428612096.0,
+            "3": 428612096.0,
+            "4": 428612096.0,
+            "5": 428612096.0,
+            "6": 428612096.0,
+            "7": 428612096.0,
+            "8": 428612096.0,
+            "9": 428612096.0,
+            "10": 428612096.0,
+            "11": 428612096.0,
+            "12": 428612096.0,
+            "13": 428612096.0,
+            "14": 428612096.0,
+            "15": 428612096.0,
+            "16": 428612096.0,
+            "17": 428612096.0,
+            "18": 428612096.0,
+            "19": 428612096.0,
+            "20": 428612096.0,
+            "21": 428612096.0,
+            "22": 428612096.0,
+            "23": 428612096.0,
+            "24": 428612096.0,
+            "25": 428612096.0,
+            "26": 428612096.0,
+            "27": 428612096.0,
+            "28": 428612096.0,
+            "29": 428612096.0,
+            "30": 428612096.0,
+            "31": 428612096.0,
+            "32": 428612096.0,
+            "33": 428612096.0,
+            "34": 428612096.0,
+            "35": 428612096.0,
+            "36": 428612096.0,
+            "37": 428612096.0,
+            "38": 428612096.0,
+            "39": 428612096.0,
+            "40": 428612096.0,
+            "41": 428612096.0,
+            "42": 428612096.0,
+            "43": 428612096.0,
+            "44": 428612096.0,
+            "45": 428612096.0,
+            "46": 428612096.0,
+            "47": 428612096.0,
+            "48": 428612096.0,
+            "49": 428612096.0,
+            "50": 428612096.0,
+            "51": 428612096.0,
+            "52": 428612096.0,
+            "53": 428612096.0,
+            "54": 428612096.0,
+            "55": 428612096.0,
+            "56": 428612096.0,
+            "57": 428612096.0,
+            "58": 428612096.0,
+            "59": 428612096.0,
+            "60": 428612096.0,
+            "61": 428612096.0,
+            "62": 428612096.0,
+            "63": 428612096.0,
+            "64": 428612096.0,
+            "65": 428612096.0,
+            "66": 428612096.0,
+            "67": 428612096.0,
+            "68": 428612096.0,
+            "69": 428612096.0,
+            "70": 428612096.0,
+            "71": 428612096.0,
+            "72": 428612096.0,
+            "73": 428612096.0,
+            "74": 428612096.0,
+            "75": 428612096.0,
+            "76": 428612096.0,
+            "77": 428612096.0,
+            "78": 428612096.0,
+            "79": 428612096.0,
+            "80": 428612096.0,
+            "81": 428612096.0,
+            "82": 428612096.0,
+            "83": 428612096.0,
+            "84": 428612096.0,
+            "85": 428612096.0,
+            "86": 428612096.0,
+            "87": 428612096.0,
+            "88": 428612096.0,
+            "89": 428612096.0,
+            "90": 428612096.0,
+            "91": 428612096.0,
+            "92": 428612096.0,
+            "93": 428612096.0,
+            "94": 428612096.0,
+            "95": 428612096.0,
+            "96": 428612096.0,
+            "97": 428612096.0,
+            "98": 428612096.0,
+            "99": 428612096.0,
+            "100": 428612096.0,
+            "101": 428612096.0,
+            "102": 428612096.0,
+            "103": 428612096.0,
+            "104": 428612096.0,
+            "105": 428612096.0,
+            "106": 428612096.0,
+            "107": 428612096.0,
+            "108": 428612096.0,
+            "109": 428612096.0,
+            "110": 428612096.0,
+            "111": 428612096.0,
+            "112": 428612096.0,
+            "113": 428612096.0,
+            "114": 428612096.0,
+            "115": 428612096.0,
+            "116": 428612096.0,
+            "117": 428612096.0,
+            "118": 428612096.0,
+            "119": 428612096.0,
+            "120": 428612096.0,
+            "121": 428612096.0,
+            "122": 428612096.0,
+            "123": 428612096.0,
+            "124": 428612096.0,
+            "125": 428612096.0,
+            "126": 428612096.0,
+            "127": 428612096.0,
+            "128": 428612096.0,
+            "129": 428612096.0,
+            "130": 428612096.0,
+            "131": 428612096.0,
+            "132": 428612096.0,
+            "133": 428612096.0,
+            "134": 428612096.0,
+            "135": 428612096.0,
+            "136": 428612096.0,
+            "137": 428612096.0,
+            "138": 428612096.0,
+            "139": 428612096.0,
+            "140": 428612096.0,
+            "141": 428612096.0,
+            "142": 428612096.0,
+            "143": 428612096.0,
+            "144": 428612096.0,
+            "145": 428612096.0,
+            "146": 428612096.0,
+            "147": 428612096.0,
+            "148": 428612096.0,
+            "149": 428612096.0,
+            "150": 428612096.0,
+            "151": 428612096.0,
+            "152": 428612096.0,
+            "153": 428612096.0,
+            "154": 428612096.0,
+            "155": 428612096.0,
+            "156": 428612096.0,
+            "157": 428612096.0,
+            "158": 428612096.0,
+            "159": 428612096.0,
+            "160": 428612096.0,
+            "161": 428612096.0,
+            "162": 428612096.0,
+            "163": 428612096.0,
+            "164": 428612096.0,
+            "165": 428612096.0,
+            "166": 428612096.0,
+            "167": 428612096.0,
+            "168": 428612096.0,
+            "169": 428612096.0,
+            "170": 428612096.0,
+            "171": 428612096.0,
+            "172": 428612096.0,
+            "173": 428612096.0,
+            "174": 428612096.0,
+            "175": 428612096.0,
+            "176": 428612096.0,
+            "177": 428612096.0,
+            "178": 428612096.0,
+            "179": 428612096.0,
+            "180": 428612096.0,
+            "181": 428612096.0,
+            "182": 428612096.0,
+            "183": 428612096.0,
+            "184": 428612096.0,
+            "185": 428612096.0,
+            "186": 428612096.0,
+            "187": 428612096.0,
+            "188": 428612096.0,
+            "189": 428612096.0,
+            "190": 428612096.0,
+            "191": 428612096.0,
+            "192": 428612096.0,
+            "193": 428612096.0,
+            "194": 428612096.0,
+            "195": 428612096.0,
+            "196": 428612096.0,
+            "197": 428612096.0,
+            "198": 428612096.0,
+            "199": 428612096.0,
+            "200": 428612096.0,
+            "201": 428612096.0,
+            "202": 428612096.0,
+            "203": 428612096.0,
+            "204": 428612096.0,
+            "205": 428612096.0,
+            "206": 428612096.0,
+            "207": 428612096.0,
+            "208": 428612096.0,
+            "209": 428612096.0,
+            "210": 428612096.0,
+            "211": 428612096.0,
+            "212": 428612096.0,
+            "213": 428612096.0,
+            "214": 428612096.0,
+            "215": 428612096.0,
+            "216": 428612096.0,
+            "217": 428612096.0,
+            "218": 428612096.0,
+            "219": 428612096.0,
+            "220": 428612096.0,
+            "221": 428612096.0,
+            "222": 428612096.0,
+            "223": 428612096.0,
+            "224": 428612096.0,
+            "225": 428612096.0,
+            "226": 428612096.0,
+            "227": 428612096.0,
+            "228": 428612096.0,
+            "229": 428612096.0,
+            "230": 428612096.0,
+            "231": 428612096.0,
+            "232": 428612096.0,
+            "233": 428612096.0,
+            "234": 428612096.0,
+            "235": 428612096.0,
+            "236": 428612096.0,
+            "237": 428612096.0,
+            "238": 428612096.0,
+            "239": 428612096.0,
+            "240": 428612096.0,
+            "241": 428612096.0,
+            "242": 428612096.0,
+            "243": 428612096.0,
+            "244": 428612096.0,
+            "245": 428612096.0,
+            "246": 428612096.0,
+            "247": 428612096.0,
+            "248": 428612096.0,
+            "249": 428612096.0,
+            "250": 428612096.0,
+            "251": 428612096.0,
+            "252": 428612096.0,
+            "253": 428612096.0,
+            "254": 428612096.0,
+            "255": 428612096.0,
+            "256": 428612096.0,
+            "257": 428612096.0,
+            "258": 428612096.0,
+            "259": 428612096.0,
+            "260": 428612096.0,
+            "261": 428612096.0,
+            "262": 428612096.0,
+            "263": 428612096.0,
+            "264": 428612096.0,
+            "265": 428612096.0,
+            "266": 428612096.0,
+            "267": 428612096.0,
+            "268": 428612096.0,
+            "269": 428612096.0,
+            "270": 428612096.0,
+            "271": 428612096.0,
+            "272": 428612096.0,
+            "273": 428612096.0,
+            "274": 428612096.0,
+            "275": 428612096.0,
+            "276": 428612096.0,
+            "277": 428612096.0,
+            "278": 428612096.0,
+            "279": 428612096.0,
+            "280": 428612096.0,
+            "281": 428612096.0,
+            "282": 428612096.0,
+            "283": 428612096.0,
+            "284": 428612096.0,
+            "285": 428612096.0,
+            "286": 428612096.0,
+            "287": 428612096.0,
+            "288": 428612096.0,
+            "289": 428612096.0,
+            "290": 428612096.0,
+            "291": 428612096.0,
+            "292": 428612096.0,
+            "293": 428612096.0,
+            "294": 428612096.0,
+            "295": 428612096.0,
+            "296": 428612096.0,
+            "297": 428612096.0,
+            "298": 428612096.0,
+            "299": 428612096.0,
+            "300": 428612096.0,
+            "301": 428612096.0,
+            "302": 428612096.0,
+            "303": 428612096.0,
+            "304": 428612096.0,
+            "305": 428612096.0,
+            "306": 428612096.0,
+            "307": 428612096.0,
+            "308": 428612096.0,
+            "309": 428612096.0,
+            "310": 428612096.0,
+            "311": 428612096.0,
+            "312": 428612096.0,
+            "313": 428612096.0,
+            "314": 428612096.0,
+            "315": 428612096.0,
+            "316": 428612096.0,
+            "317": 428612096.0,
+            "318": 428612096.0,
+            "319": 428612096.0,
+            "320": 428612096.0,
+            "321": 428612096.0,
+            "322": 428612096.0,
+            "323": 428612096.0,
+            "324": 428612096.0,
+            "325": 428612096.0,
+            "326": 428612096.0,
+            "327": 428612096.0,
+            "328": 428612096.0,
+            "329": 428612096.0,
+            "330": 428612096.0,
+            "331": 428612096.0,
+            "332": 428612096.0,
+            "333": 428612096.0,
+            "334": 428612096.0,
+            "335": 428612096.0,
+            "336": 428612096.0,
+            "337": 428612096.0,
+            "338": 428612096.0,
+            "339": 428612096.0,
+            "340": 428612096.0,
+            "341": 428612096.0,
+            "342": 428612096.0,
+            "343": 428612096.0,
+            "344": 428612096.0,
+            "345": 428612096.0,
+            "346": 428612096.0,
+            "347": 428612096.0,
+            "348": 428612096.0,
+            "349": 428612096.0,
+            "350": 428612096.0,
+            "351": 428612096.0,
+            "352": 428612096.0,
+            "353": 428612096.0,
+            "354": 428612096.0,
+            "355": 428612096.0,
+            "356": 428612096.0,
+            "357": 428612096.0,
+            "358": 428612096.0,
+            "359": 428612096.0,
+            "360": 428612096.0,
+            "361": 428612096.0,
+            "362": 428612096.0,
+            "363": 428612096.0,
+            "364": 428612096.0,
+            "365": 428612096.0,
+            "366": 428612096.0,
+            "367": 428612096.0,
+            "368": 428612096.0,
+            "369": 428612096.0,
+            "370": 428612096.0,
+            "371": 428612096.0,
+            "372": 428612096.0,
+            "373": 428612096.0,
+            "374": 428612096.0,
+            "375": 428612096.0,
+            "376": 428612096.0,
+            "377": 428612096.0,
+            "378": 428612096.0,
+            "379": 428612096.0,
+            "380": 428612096.0,
+            "381": 428612096.0,
+            "382": 428612096.0,
+            "383": 428612096.0,
+            "384": 428612096.0,
+            "385": 428612096.0,
+            "386": 428612096.0,
+            "387": 428612096.0,
+            "388": 428612096.0,
+            "389": 428612096.0,
+            "390": 428612096.0,
+            "391": 428612096.0,
+            "392": 428612096.0,
+            "393": 428612096.0,
+            "394": 428612096.0,
+            "395": 428612096.0,
+            "396": 428612096.0,
+            "397": 428612096.0,
+            "398": 428612096.0,
+            "399": 428612096.0,
+            "400": 428612096.0,
+            "401": 428612096.0,
+            "402": 428612096.0,
+            "403": 428612096.0,
+            "404": 428612096.0,
+            "405": 428612096.0,
+            "406": 428612096.0,
+            "407": 428612096.0,
+            "408": 428612096.0,
+            "409": 428612096.0,
+            "410": 428612096.0,
+            "411": 428612096.0,
+            "412": 428612096.0,
+            "413": 428612096.0,
+            "414": 428612096.0,
+            "415": 428612096.0,
+            "416": 428612096.0,
+            "417": 428612096.0,
+            "418": 428612096.0,
+            "419": 428612096.0,
+            "420": 428612096.0,
+            "421": 428612096.0,
+            "422": 428612096.0,
+            "423": 428612096.0,
+            "424": 428612096.0,
+            "425": 428612096.0,
+            "426": 428612096.0,
+            "427": 428612096.0,
+            "428": 428612096.0,
+            "429": 428612096.0,
+            "430": 428612096.0,
+            "431": 428612096.0,
+            "432": 428612096.0,
+            "433": 428612096.0,
+            "434": 428612096.0,
+            "435": 428612096.0,
+            "436": 428612096.0,
+            "437": 428612096.0,
+            "438": 428612096.0,
+            "439": 428612096.0,
+            "440": 428612096.0,
+            "441": 428612096.0,
+            "442": 428612096.0,
+            "443": 428612096.0,
+            "444": 428612096.0,
+            "445": 428612096.0,
+            "446": 428612096.0,
+            "447": 428612096.0,
+            "448": 428612096.0,
+            "449": 428612096.0,
+            "450": 428612096.0,
+            "451": 428612096.0,
+            "452": 428612096.0,
+            "453": 428612096.0,
+            "454": 428612096.0,
+            "455": 428612096.0,
+            "456": 428612096.0,
+            "457": 428612096.0,
+            "458": 428612096.0,
+            "459": 428612096.0,
+            "460": 428612096.0,
+            "461": 428612096.0,
+            "462": 428612096.0,
+            "463": 428612096.0,
+            "464": 428612096.0,
+            "465": 428612096.0,
+            "466": 428612096.0,
+            "467": 428612096.0,
+            "468": 428612096.0,
+            "469": 428612096.0,
+            "470": 428612096.0,
+            "471": 428612096.0,
+            "472": 428612096.0,
+            "473": 428612096.0,
+            "474": 428612096.0,
+            "475": 428612096.0,
+            "476": 428612096.0,
+            "477": 428612096.0,
+            "478": 428612096.0,
+            "479": 428612096.0,
+            "480": 428612096.0,
+            "481": 428612096.0,
+            "482": 428612096.0,
+            "483": 428612096.0,
+            "484": 428612096.0,
+            "485": 428612096.0,
+            "486": 428612096.0,
+            "487": 428612096.0,
+            "488": 428612096.0,
+            "489": 428612096.0,
+            "490": 428612096.0,
+            "491": 428612096.0,
+            "492": 428612096.0,
+            "493": 428612096.0,
+            "494": 428612096.0,
+            "495": 428612096.0,
+            "496": 428612096.0,
+            "497": 428612096.0,
+            "498": 428612096.0,
+            "499": 428612096.0,
+            "500": 428612096.0,
+            "501": 428612096.0,
+            "502": 428612096.0,
+            "503": 428612096.0,
+            "504": 428612096.0,
+            "505": 428612096.0,
+            "506": 428612096.0,
+            "507": 428612096.0,
+            "508": 428612096.0,
+            "509": 428612096.0,
+            "510": 428612096.0,
+            "511": 428612096.0,
+            "512": 428612096.0,
+            "513": 428612096.0,
+            "514": 428612096.0,
+            "515": 428612096.0,
+            "516": 428612096.0,
+            "517": 428612096.0,
+            "518": 428612096.0,
+            "519": 428612096.0,
+            "520": 428612096.0,
+            "521": 428612096.0,
+            "522": 428612096.0,
+            "523": 428612096.0,
+            "524": 428612096.0,
+            "525": 428612096.0,
+            "526": 428612096.0,
+            "527": 428612096.0,
+            "528": 428612096.0,
+            "529": 428612096.0,
+            "530": 428612096.0,
+            "531": 428612096.0,
+            "532": 428612096.0,
+            "533": 428612096.0,
+            "534": 428612096.0,
+            "535": 428612096.0,
+            "536": 428612096.0,
+            "537": 428612096.0,
+            "538": 428612096.0,
+            "539": 428612096.0,
+            "540": 428612096.0,
+            "541": 428612096.0,
+            "542": 428612096.0,
+            "543": 428612096.0,
+            "544": 428612096.0,
+            "545": 428612096.0,
+            "546": 428612096.0,
+            "547": 428612096.0,
+            "548": 428612096.0,
+            "549": 428612096.0,
+            "550": 428612096.0,
+            "551": 428612096.0,
+            "552": 428612096.0,
+            "553": 428612096.0,
+            "554": 428612096.0,
+            "555": 428612096.0,
+            "556": 428612096.0,
+            "557": 428612096.0,
+            "558": 428612096.0,
+            "559": 428612096.0,
+            "560": 428612096.0,
+            "561": 428612096.0,
+            "562": 428612096.0,
+            "563": 428612096.0,
+            "564": 428612096.0,
+            "565": 428612096.0,
+            "566": 428612096.0,
+            "567": 428612096.0,
+            "568": 428612096.0,
+            "569": 428612096.0,
+            "570": 428612096.0,
+            "571": 428612096.0,
+            "572": 428612096.0,
+            "573": 428612096.0,
+            "574": 428612096.0,
+            "575": 428612096.0,
+            "576": 428612096.0,
+            "577": 428612096.0,
+            "578": 428612096.0,
+            "579": 428612096.0,
+            "580": 428612096.0,
+            "581": 428612096.0,
+            "582": 428612096.0,
+            "583": 428612096.0,
+            "584": 428612096.0,
+            "585": 428612096.0,
+            "586": 428612096.0,
+            "587": 428612096.0,
+            "588": 428612096.0,
+            "589": 428612096.0,
+            "590": 428612096.0,
+            "591": 428612096.0,
+            "592": 428612096.0,
+            "593": 428612096.0,
+            "594": 428612096.0,
+            "595": 428612096.0,
+            "596": 428612096.0,
+            "597": 428612096.0,
+            "598": 428612096.0,
+            "599": 428612096.0,
+            "600": 428612096.0,
+            "601": 428612096.0,
+            "602": 428612096.0,
+            "603": 428612096.0,
+            "604": 428612096.0,
+            "605": 428612096.0,
+            "606": 428612096.0,
+            "607": 428612096.0,
+            "608": 428612096.0,
+            "609": 428612096.0,
+            "610": 428612096.0,
+            "611": 428612096.0,
+            "612": 428612096.0,
+            "613": 428612096.0,
+            "614": 428612096.0,
+            "615": 428612096.0,
+            "616": 428612096.0,
+            "617": 428612096.0,
+            "618": 428612096.0,
+            "619": 428612096.0,
+            "620": 428612096.0,
+            "621": 428612096.0,
+            "622": 428612096.0,
+            "623": 428612096.0,
+            "624": 428612096.0,
+            "625": 428612096.0,
+            "626": 428612096.0,
+            "627": 428612096.0,
+            "628": 428612096.0,
+            "629": 428612096.0,
+            "630": 428612096.0,
+            "631": 428612096.0,
+            "632": 428612096.0,
+            "633": 428612096.0,
+            "634": 428612096.0,
+            "635": 428612096.0,
+            "636": 428612096.0,
+            "637": 428612096.0,
+            "638": 428612096.0,
+            "639": 428612096.0,
+            "640": 428612096.0,
+            "641": 428612096.0,
+            "642": 428612096.0,
+            "643": 428612096.0,
+            "644": 428612096.0,
+            "645": 428612096.0,
+            "646": 428612096.0,
+            "647": 428612096.0,
+            "648": 428612096.0,
+            "649": 428612096.0,
+            "650": 428612096.0,
+            "651": 428612096.0,
+            "652": 428612096.0,
+            "653": 428612096.0,
+            "654": 428612096.0,
+            "655": 428612096.0,
+            "656": 428612096.0,
+            "657": 428612096.0,
+            "658": 428612096.0,
+            "659": 428612096.0,
+            "660": 428612096.0,
+            "661": 428612096.0,
+            "662": 428612096.0,
+            "663": 428612096.0,
+            "664": 428612096.0,
+            "665": 428612096.0,
+            "666": 428612096.0,
+            "667": 428612096.0,
+            "668": 428612096.0,
+            "669": 428612096.0,
+            "670": 428612096.0,
+            "671": 428612096.0,
+            "672": 428612096.0,
+            "673": 428612096.0,
+            "674": 428612096.0,
+            "675": 428612096.0,
+            "676": 428612096.0,
+            "677": 428612096.0,
+            "678": 428612096.0,
+            "679": 428612096.0,
+            "680": 428612096.0,
+            "681": 428612096.0,
+            "682": 428612096.0,
+            "683": 428612096.0,
+            "684": 428612096.0,
+            "685": 428612096.0,
+            "686": 428612096.0,
+            "687": 428612096.0,
+            "688": 428612096.0,
+            "689": 428612096.0,
+            "690": 428612096.0,
+            "691": 428612096.0,
+            "692": 428612096.0,
+            "693": 428612096.0,
+            "694": 428612096.0,
+            "695": 428612096.0,
+            "696": 428612096.0,
+            "697": 428612096.0,
+            "698": 428612096.0,
+            "699": 428612096.0,
+            "700": 428612096.0,
+            "701": 428612096.0,
+            "702": 428612096.0,
+            "703": 428612096.0,
+            "704": 428612096.0,
+            "705": 428612096.0,
+            "706": 428612096.0,
+            "707": 428612096.0,
+            "708": 428612096.0,
+            "709": 428612096.0,
+            "710": 428612096.0,
+            "711": 428612096.0,
+            "712": 428612096.0,
+            "713": 428612096.0,
+            "714": 428612096.0,
+            "715": 428612096.0,
+            "716": 428612096.0,
+            "717": 428612096.0,
+            "718": 428612096.0,
+            "719": 428612096.0,
+            "720": 428612096.0,
+            "721": 428612096.0,
+            "722": 428612096.0,
+            "723": 428612096.0,
+            "724": 428612096.0,
+            "725": 428612096.0,
+            "726": 428612096.0,
+            "727": 428612096.0,
+            "728": 428612096.0,
+            "729": 428612096.0,
+            "730": 428612096.0,
+            "731": 428612096.0,
+            "732": 428612096.0,
+            "733": 428612096.0,
+            "734": 428612096.0,
+            "735": 428612096.0,
+            "736": 428612096.0,
+            "737": 428612096.0,
+            "738": 428612096.0,
+            "739": 428612096.0,
+            "740": 428612096.0,
+            "741": 428612096.0,
+            "742": 428612096.0,
+            "743": 428612096.0,
+            "744": 428612096.0,
+            "745": 428612096.0,
+            "746": 428612096.0,
+            "747": 428612096.0,
+            "748": 428612096.0,
+            "749": 428612096.0,
+            "750": 428612096.0,
+            "751": 428612096.0,
+            "752": 428612096.0,
+            "753": 428612096.0,
+            "754": 428612096.0,
+            "755": 428612096.0,
+            "756": 428612096.0,
+            "757": 428612096.0,
+            "758": 428612096.0,
+            "759": 428612096.0,
+            "760": 428612096.0,
+            "761": 428612096.0,
+            "762": 428612096.0,
+            "763": 428612096.0,
+            "764": 428612096.0,
+            "765": 428612096.0,
+            "766": 428612096.0,
+            "767": 428612096.0,
+            "768": 428612096.0,
+            "769": 428612096.0,
+            "770": 428612096.0,
+            "771": 428612096.0,
+            "772": 428612096.0,
+            "773": 428612096.0,
+            "774": 428612096.0,
+            "775": 428612096.0,
+            "776": 428612096.0,
+            "777": 428612096.0,
+            "778": 428612096.0,
+            "779": 428612096.0,
+            "780": 428612096.0,
+            "781": 428612096.0,
+            "782": 428612096.0,
+            "783": 428612096.0,
+            "784": 428612096.0,
+            "785": 428612096.0,
+            "786": 428612096.0,
+            "787": 428612096.0,
+            "788": 428612096.0,
+            "789": 428612096.0,
+            "790": 428612096.0,
+            "791": 428612096.0,
+            "792": 428612096.0,
+            "793": 428612096.0,
+            "794": 428612096.0,
+            "795": 428612096.0,
+            "796": 428612096.0,
+            "797": 428612096.0,
+            "798": 428612096.0,
+            "799": 428612096.0,
+            "800": 428612096.0,
+            "801": 428612096.0,
+            "802": 428612096.0,
+            "803": 428612096.0,
+            "804": 428612096.0,
+            "805": 428612096.0,
+            "806": 428612096.0,
+            "807": 428612096.0,
+            "808": 428612096.0,
+            "809": 428612096.0,
+            "810": 428612096.0,
+            "811": 428612096.0,
+            "812": 428612096.0,
+            "813": 428612096.0,
+            "814": 428612096.0,
+            "815": 428612096.0,
+            "816": 428612096.0,
+            "817": 428612096.0,
+            "818": 428612096.0,
+            "819": 428612096.0,
+            "820": 428612096.0,
+            "821": 428612096.0,
+            "822": 428612096.0,
+            "823": 428612096.0,
+            "824": 428612096.0,
+            "825": 428612096.0,
+            "826": 428612096.0,
+            "827": 428612096.0,
+            "828": 428612096.0,
+            "829": 428612096.0,
+            "830": 428612096.0,
+            "831": 428612096.0,
+            "832": 428612096.0,
+            "833": 428612096.0,
+            "834": 428612096.0,
+            "835": 428612096.0,
+            "836": 428612096.0,
+            "837": 428612096.0,
+            "838": 428612096.0,
+            "839": 428612096.0,
+            "840": 428612096.0,
+            "841": 428612096.0,
+            "842": 428612096.0,
+            "843": 428612096.0,
+            "844": 428612096.0,
+            "845": 428612096.0,
+            "846": 428612096.0,
+            "847": 428612096.0,
+            "848": 428612096.0,
+            "849": 428612096.0,
+            "850": 428612096.0,
+            "851": 428612096.0,
+            "852": 428612096.0,
+            "853": 428612096.0,
+            "854": 428612096.0,
+            "855": 428612096.0,
+            "856": 428612096.0,
+            "857": 428612096.0,
+            "858": 428612096.0,
+            "859": 428612096.0,
+            "860": 428612096.0,
+            "861": 428612096.0,
+            "862": 428612096.0,
+            "863": 428612096.0,
+            "864": 428612096.0,
+            "865": 428612096.0,
+            "866": 428612096.0,
+            "867": 428612096.0,
+            "868": 428612096.0,
+            "869": 428612096.0,
+            "870": 428612096.0,
+            "871": 428612096.0,
+            "872": 428612096.0,
+            "873": 428612096.0,
+            "874": 428612096.0,
+            "875": 428612096.0,
+            "876": 428612096.0,
+            "877": 428612096.0,
+            "878": 428612096.0,
+            "879": 428612096.0,
+            "880": 428612096.0,
+            "881": 428612096.0,
+            "882": 428612096.0,
+            "883": 428612096.0,
+            "884": 428612096.0,
+            "885": 428612096.0,
+            "886": 428612096.0,
+            "887": 428612096.0,
+            "888": 428612096.0,
+            "889": 428612096.0,
+            "890": 428612096.0,
+            "891": 428612096.0,
+            "892": 428612096.0,
+            "893": 428612096.0,
+            "894": 428612096.0,
+            "895": 428612096.0,
+            "896": 428612096.0,
+            "897": 428612096.0,
+            "898": 428612096.0,
+            "899": 428612096.0,
+            "900": 428612096.0,
+            "901": 428612096.0,
+            "902": 428612096.0,
+            "903": 428612096.0,
+            "904": 428612096.0,
+            "905": 428612096.0,
+            "906": 428612096.0,
+            "907": 428612096.0,
+            "908": 428612096.0,
+            "909": 428612096.0,
+            "910": 428612096.0,
+            "911": 428612096.0,
+            "912": 428612096.0,
+            "913": 428612096.0,
+            "914": 428612096.0,
+            "915": 428612096.0,
+            "916": 428612096.0,
+            "917": 428612096.0,
+            "918": 428612096.0,
+            "919": 428612096.0,
+            "920": 428612096.0,
+            "921": 428612096.0,
+            "922": 428612096.0,
+            "923": 428612096.0,
+            "924": 428612096.0,
+            "925": 428612096.0,
+            "926": 428612096.0,
+            "927": 428612096.0,
+            "928": 428612096.0,
+            "929": 428612096.0,
+            "930": 428612096.0,
+            "931": 428612096.0,
+            "932": 428612096.0,
+            "933": 428612096.0,
+            "934": 428612096.0,
+            "935": 428612096.0,
+            "936": 428612096.0,
+            "937": 428612096.0,
+            "938": 428612096.0,
+            "939": 428612096.0,
+            "940": 428612096.0,
+            "941": 428612096.0,
+            "942": 428612096.0,
+            "943": 428612096.0,
+            "944": 428612096.0,
+            "945": 428612096.0,
+            "946": 428612096.0,
+            "947": 428612096.0,
+            "948": 428612096.0,
+            "949": 428612096.0,
+            "950": 428612096.0,
+            "951": 428612096.0,
+            "952": 428612096.0,
+            "953": 428612096.0,
+            "954": 428612096.0,
+            "955": 428612096.0,
+            "956": 428612096.0,
+            "957": 428612096.0,
+            "958": 428612096.0,
+            "959": 428612096.0,
+            "960": 428612096.0,
+            "961": 428612096.0,
+            "962": 428612096.0,
+            "963": 428612096.0,
+            "964": 428612096.0,
+            "965": 428612096.0,
+            "966": 428612096.0,
+            "967": 428612096.0,
+            "968": 428612096.0,
+            "969": 428612096.0,
+            "970": 428612096.0,
+            "971": 428612096.0,
+            "972": 428612096.0,
+            "973": 428612096.0,
+            "974": 428612096.0,
+            "975": 428612096.0,
+            "976": 428612096.0,
+            "977": 428612096.0,
+            "978": 428612096.0,
+            "979": 428612096.0,
+            "980": 428612096.0,
+            "981": 428612096.0,
+            "982": 428612096.0,
+            "983": 428612096.0,
+            "984": 428612096.0,
+            "985": 428612096.0,
+            "986": 428612096.0,
+            "987": 428612096.0,
+            "988": 428612096.0,
+            "989": 428612096.0,
+            "990": 428612096.0,
+            "991": 428612096.0,
+            "992": 428612096.0,
+            "993": 428612096.0,
+            "994": 428612096.0,
+            "995": 428612096.0,
+            "996": 428612096.0,
+            "997": 428612096.0,
+            "998": 428612096.0,
+            "999": 428612096.0,
+            "1000": 428612096.0,
+            "1001": 428612096.0,
+            "1002": 428612096.0,
+            "1003": 428612096.0,
+            "1004": 428612096.0,
+            "1005": 428612096.0,
+            "1006": 428612096.0,
+            "1007": 428612096.0,
+            "1008": 428612096.0,
+            "1009": 428612096.0,
+            "1010": 428612096.0,
+            "1011": 428612096.0,
+            "1012": 428612096.0,
+            "1013": 428612096.0,
+            "1014": 428612096.0,
+            "1015": 428612096.0,
+            "1016": 428612096.0,
+            "1017": 428612096.0,
+            "1018": 428612096.0,
+            "1019": 428612096.0,
+            "1020": 428612096.0,
+            "1021": 428612096.0,
+            "1022": 428612096.0,
+            "1023": 428612096.0,
+            "1024": 428612096.0,
+            "1025": 428612096.0,
+            "1026": 428612096.0,
+            "1027": 428612096.0,
+            "1028": 428612096.0,
+            "1029": 428612096.0,
+            "1030": 428612096.0,
+            "1031": 428612096.0,
+            "1032": 428612096.0,
+            "1033": 428612096.0,
+            "1034": 428612096.0,
+            "1035": 428612096.0,
+            "1036": 428612096.0,
+            "1037": 428612096.0,
+            "1038": 428612096.0,
+            "1039": 428612096.0,
+            "1040": 428612096.0,
+            "1041": 428612096.0,
+            "1042": 428612096.0,
+            "1043": 428612096.0,
+            "1044": 428612096.0,
+            "1045": 428612096.0,
+            "1046": 428612096.0,
+            "1047": 428612096.0,
+            "1048": 428612096.0,
+            "1049": 428612096.0,
+            "1050": 428612096.0,
+            "1051": 428612096.0,
+            "1052": 428612096.0,
+            "1053": 428612096.0,
+            "1054": 428612096.0,
+            "1055": 428612096.0,
+            "1056": 428612096.0,
+            "1057": 428612096.0,
+            "1058": 428612096.0,
+            "1059": 428612096.0,
+            "1060": 428612096.0,
+            "1061": 428612096.0,
+            "1062": 428612096.0,
+            "1063": 428612096.0,
+            "1064": 428612096.0,
+            "1065": 428612096.0,
+            "1066": 428612096.0,
+            "1067": 428612096.0,
+            "1068": 428612096.0,
+            "1069": 428612096.0,
+            "1070": 428612096.0,
+            "1071": 428612096.0,
+            "1072": 428612096.0,
+            "1073": 428612096.0,
+            "1074": 428612096.0,
+            "1075": 428612096.0,
+            "1076": 428612096.0,
+            "1077": 428612096.0,
+            "1078": 428612096.0,
+            "1079": 428612096.0,
+            "1080": 428612096.0,
+            "1081": 428612096.0,
+            "1082": 428612096.0,
+            "1083": 428612096.0,
+            "1084": 428612096.0,
+            "1085": 428612096.0,
+            "1086": 428612096.0,
+            "1087": 428612096.0,
+            "1088": 428612096.0,
+            "1089": 428612096.0,
+            "1090": 428612096.0,
+            "1091": 428612096.0,
+            "1092": 428612096.0,
+            "1093": 428612096.0,
+            "1094": 428612096.0,
+            "1095": 428612096.0,
+            "1096": 428612096.0,
+            "1097": 428612096.0,
+            "1098": 428612096.0,
+            "1099": 428612096.0,
+            "1100": 428612096.0,
+            "1101": 428612096.0,
+            "1102": 428612096.0,
+            "1103": 428612096.0,
+            "1104": 428612096.0,
+            "1105": 428612096.0,
+            "1106": 428612096.0,
+            "1107": 428612096.0,
+            "1108": 428612096.0,
+            "1109": 428612096.0,
+            "1110": 428612096.0,
+            "1111": 428612096.0,
+            "1112": 428612096.0,
+            "1113": 428612096.0,
+            "1114": 428612096.0,
+            "1115": 428612096.0,
+            "1116": 428612096.0,
+            "1117": 428612096.0,
+            "1118": 428612096.0,
+            "1119": 428612096.0,
+            "1120": 428612096.0,
+            "1121": 428612096.0,
+            "1122": 428612096.0,
+            "1123": 428612096.0,
+            "1124": 428612096.0,
+            "1125": 428612096.0,
+            "1126": 428612096.0,
+            "1127": 428612096.0,
+            "1128": 428612096.0,
+            "1129": 428612096.0,
+            "1130": 428612096.0,
+            "1131": 428612096.0,
+            "1132": 428612096.0,
+            "1133": 428612096.0,
+            "1134": 428612096.0,
+            "1135": 428612096.0,
+            "1136": 428612096.0,
+            "1137": 428612096.0,
+            "1138": 428612096.0,
+            "1139": 428612096.0,
+            "1140": 428612096.0,
+            "1141": 428612096.0,
+            "1142": 428612096.0,
+            "1143": 428612096.0,
+            "1144": 428612096.0,
+            "1145": 428612096.0,
+            "1146": 428612096.0,
+            "1147": 428612096.0,
+            "1148": 428612096.0,
+            "1149": 428612096.0,
+            "1150": 428612096.0,
+            "1151": 428612096.0,
+            "1152": 428612096.0,
+            "1153": 428612096.0,
+            "1154": 428612096.0,
+            "1155": 428612096.0,
+            "1156": 428612096.0,
+            "1157": 428612096.0,
+            "1158": 428612096.0,
+            "1159": 428612096.0,
+            "1160": 428612096.0,
+            "1161": 428612096.0,
+            "1162": 428612096.0,
+            "1163": 428612096.0,
+            "1164": 428612096.0,
+            "1165": 428612096.0,
+            "1166": 428612096.0,
+            "1167": 428612096.0,
+            "1168": 428612096.0,
+            "1169": 428612096.0,
+            "1170": 428612096.0,
+            "1171": 428612096.0,
+            "1172": 428612096.0,
+            "1173": 428612096.0,
+            "1174": 428612096.0,
+            "1175": 428612096.0,
+            "1176": 428612096.0,
+            "1177": 428612096.0,
+            "1178": 428612096.0,
+            "1179": 428612096.0,
+            "1180": 428612096.0,
+            "1181": 428612096.0,
+            "1182": 428612096.0,
+            "1183": 428612096.0,
+            "1184": 428612096.0,
+            "1185": 428612096.0,
+            "1186": 428612096.0,
+            "1187": 428612096.0,
+            "1188": 428612096.0,
+            "1189": 428612096.0,
+            "1190": 428612096.0,
+            "1191": 428612096.0,
+            "1192": 428612096.0,
+            "1193": 428612096.0,
+            "1194": 428612096.0,
+            "1195": 428612096.0,
+            "1196": 428612096.0,
+            "1197": 428612096.0,
+            "1198": 428612096.0,
+            "1199": 428612096.0,
+            "1200": 428612096.0,
+            "1201": 428612096.0,
+            "1202": 428612096.0,
+            "1203": 428612096.0,
+            "1204": 428612096.0,
+            "1205": 428612096.0,
+            "1206": 428612096.0,
+            "1207": 428612096.0,
+            "1208": 428612096.0,
+            "1209": 428612096.0,
+            "1210": 428612096.0,
+            "1211": 428612096.0,
+            "1212": 428612096.0,
+            "1213": 428612096.0,
+            "1214": 428612096.0,
+            "1215": 428612096.0,
+            "1216": 428612096.0,
+            "1217": 428612096.0,
+            "1218": 428612096.0,
+            "1219": 428612096.0,
+            "1220": 428612096.0,
+            "1221": 428612096.0,
+            "1222": 428612096.0,
+            "1223": 428612096.0,
+            "1224": 428612096.0,
+            "1225": 428612096.0,
+            "1226": 428612096.0,
+            "1227": 428612096.0,
+            "1228": 428612096.0,
+            "1229": 428612096.0,
+            "1230": 428612096.0,
+            "1231": 428612096.0,
+            "1232": 428612096.0,
+            "1233": 428612096.0,
+            "1234": 428612096.0,
+            "1235": 428612096.0,
+            "1236": 428612096.0,
+            "1237": 428612096.0,
+            "1238": 428612096.0,
+            "1239": 428612096.0,
+            "1240": 428612096.0,
+            "1241": 428612096.0,
+            "1242": 428612096.0,
+            "1243": 428612096.0,
+            "1244": 428612096.0,
+            "1245": 428612096.0,
+            "1246": 428612096.0,
+            "1247": 428612096.0,
+            "1248": 428612096.0,
+            "1249": 428612096.0,
+            "1250": 428612096.0,
+            "1251": 428612096.0,
+            "1252": 428612096.0,
+            "1253": 428612096.0,
+            "1254": 428612096.0,
+            "1255": 428612096.0,
+            "1256": 428612096.0,
+            "1257": 428612096.0,
+            "1258": 428612096.0,
+            "1259": 428612096.0,
+            "1260": 428612096.0,
+            "1261": 428612096.0,
+            "1262": 428612096.0,
+            "1263": 428612096.0,
+            "1264": 428612096.0,
+            "1265": 428612096.0,
+            "1266": 428612096.0,
+            "1267": 428612096.0,
+            "1268": 428612096.0,
+            "1269": 428612096.0,
+            "1270": 428612096.0,
+            "1271": 428612096.0,
+            "1272": 428612096.0,
+            "1273": 428612096.0,
+            "1274": 428612096.0,
+            "1275": 428612096.0,
+            "1276": 428612096.0,
+            "1277": 428612096.0,
+            "1278": 428612096.0,
+            "1279": 428612096.0,
+            "1280": 428612096.0,
+            "1281": 428612096.0,
+            "1282": 428612096.0,
+            "1283": 428612096.0,
+            "1284": 428612096.0,
+            "1285": 428612096.0,
+            "1286": 428612096.0,
+            "1287": 428612096.0,
+            "1288": 428612096.0,
+            "1289": 428612096.0,
+            "1290": 428612096.0,
+            "1291": 428612096.0,
+            "1292": 428612096.0,
+            "1293": 428612096.0,
+            "1294": 428612096.0,
+            "1295": 428612096.0,
+            "1296": 428612096.0,
+            "1297": 428612096.0,
+            "1298": 428612096.0,
+            "1299": 428612096.0,
+            "1300": 428612096.0,
+            "1301": 428612096.0,
+            "1302": 428612096.0,
+            "1303": 428612096.0,
+            "1304": 428612096.0,
+            "1305": 428612096.0,
+            "1306": 428612096.0,
+            "1307": 428612096.0,
+            "1308": 428612096.0,
+            "1309": 428612096.0,
+            "1310": 428612096.0,
+            "1311": 428612096.0,
+            "1312": 428612096.0,
+            "1313": 428612096.0,
+            "1314": 428612096.0,
+            "1315": 428612096.0,
+            "1316": 428612096.0,
+            "1317": 428612096.0,
+            "1318": 428612096.0,
+            "1319": 428612096.0,
+            "1320": 428612096.0,
+            "1321": 428612096.0,
+            "1322": 428612096.0,
+            "1323": 428612096.0,
+            "1324": 428612096.0,
+            "1325": 428612096.0,
+            "1326": 428612096.0,
+            "1327": 428612096.0,
+            "1328": 428612096.0,
+            "1329": 428612096.0,
+            "1330": 428612096.0,
+            "1331": 428612096.0,
+            "1332": 428612096.0,
+            "1333": 428612096.0,
+            "1334": 428612096.0,
+            "1335": 428612096.0,
+            "1336": 428612096.0,
+            "1337": 428612096.0,
+            "1338": 428612096.0,
+            "1339": 428612096.0,
+            "1340": 428612096.0,
+            "1341": 428612096.0,
+            "1342": 428612096.0,
+            "1343": 428612096.0,
+            "1344": 428612096.0,
+            "1345": 428612096.0,
+            "1346": 428612096.0,
+            "1347": 428612096.0,
+            "1348": 428612096.0,
+            "1349": 428612096.0,
+            "1350": 428612096.0,
+            "1351": 428612096.0,
+            "1352": 428612096.0,
+            "1353": 428612096.0,
+            "1354": 428612096.0,
+            "1355": 428612096.0,
+            "1356": 428612096.0,
+            "1357": 428612096.0,
+            "1358": 428612096.0,
+            "1359": 428612096.0,
+            "1360": 428612096.0,
+            "1361": 428612096.0,
+            "1362": 428612096.0,
+            "1363": 428612096.0,
+            "1364": 428612096.0,
+            "1365": 428612096.0,
+            "1366": 428612096.0,
+            "1367": 428612096.0,
+            "1368": 428612096.0,
+            "1369": 428612096.0,
+            "1370": 428612096.0,
+            "1371": 428612096.0,
+            "1372": 428612096.0,
+            "1373": 428612096.0,
+            "1374": 428612096.0,
+            "1375": 428612096.0,
+            "1376": 428612096.0,
+            "1377": 428612096.0,
+            "1378": 428612096.0,
+            "1379": 428612096.0,
+            "1380": 428612096.0,
+            "1381": 428612096.0,
+            "1382": 428612096.0,
+            "1383": 428612096.0,
+            "1384": 428612096.0,
+            "1385": 428612096.0,
+            "1386": 428612096.0,
+            "1387": 428612096.0,
+            "1388": 428612096.0,
+            "1389": 428612096.0,
+            "1390": 428612096.0,
+            "1391": 428612096.0,
+            "1392": 428612096.0,
+            "1393": 428612096.0,
+            "1394": 428612096.0,
+            "1395": 428612096.0,
+            "1396": 428612096.0,
+            "1397": 428612096.0,
+            "1398": 428612096.0,
+            "1399": 428612096.0,
+            "1400": 428612096.0,
+            "1401": 428612096.0,
+            "1402": 428612096.0,
+            "1403": 428612096.0,
+            "1404": 428612096.0,
+            "1405": 428612096.0,
+            "1406": 428612096.0,
+            "1407": 428612096.0,
+            "1408": 428612096.0,
+            "1409": 428612096.0,
+            "1410": 428612096.0,
+            "1411": 428612096.0,
+            "1412": 428612096.0,
+            "1413": 428612096.0,
+            "1414": 428612096.0,
+            "1415": 428612096.0,
+            "1416": 428612096.0,
+            "1417": 428612096.0,
+            "1418": 428612096.0,
+            "1419": 428612096.0,
+            "1420": 428612096.0,
+            "1421": 428612096.0,
+            "1422": 428612096.0,
+            "1423": 428612096.0,
+            "1424": 428612096.0,
+            "1425": 428612096.0,
+            "1426": 428612096.0,
+            "1427": 428612096.0,
+            "1428": 428612096.0,
+            "1429": 428612096.0,
+            "1430": 428612096.0,
+            "1431": 428612096.0,
+            "1432": 428612096.0,
+            "1433": 428612096.0,
+            "1434": 428612096.0,
+            "1435": 428612096.0,
+            "1436": 428612096.0,
+            "1437": 428612096.0,
+            "1438": 428612096.0,
+            "1439": 428612096.0,
+            "1440": 428612096.0,
+            "1441": 428612096.0,
+            "1442": 428612096.0,
+            "1443": 428612096.0,
+            "1444": 428612096.0,
+            "1445": 428612096.0,
+            "1446": 428612096.0,
+            "1447": 428612096.0,
+            "1448": 428612096.0,
+            "1449": 428612096.0,
+            "1450": 428612096.0,
+            "1451": 428612096.0,
+            "1452": 428612096.0,
+            "1453": 428612096.0,
+            "1454": 428612096.0,
+            "1455": 428612096.0,
+            "1456": 428612096.0,
+            "1457": 428612096.0,
+            "1458": 428612096.0,
+            "1459": 428612096.0,
+            "1460": 428612096.0,
+            "1461": 428612096.0,
+            "1462": 428612096.0,
+            "1463": 428612096.0,
+            "1464": 428612096.0,
+            "1465": 428612096.0,
+            "1466": 428612096.0,
+            "1467": 428612096.0,
+            "1468": 428612096.0,
+            "1469": 428612096.0,
+            "1470": 428612096.0,
+            "1471": 428612096.0,
+            "1472": 428612096.0,
+            "1473": 428612096.0,
+            "1474": 428612096.0,
+            "1475": 428612096.0,
+            "1476": 428612096.0,
+            "1477": 428612096.0,
+            "1478": 428612096.0,
+            "1479": 428612096.0,
+            "1480": 428612096.0,
+            "1481": 428612096.0,
+            "1482": 428612096.0,
+            "1483": 428612096.0,
+            "1484": 428612096.0,
+            "1485": 428612096.0,
+            "1486": 428612096.0,
+            "1487": 428612096.0,
+            "1488": 428612096.0,
+            "1489": 428612096.0,
+            "1490": 428612096.0,
+            "1491": 428612096.0,
+            "1492": 428612096.0,
+            "1493": 428612096.0,
+            "1494": 428612096.0,
+            "1495": 428612096.0,
+            "1496": 428612096.0,
+            "1497": 428612096.0,
+            "1498": 428612096.0,
+            "1499": 428612096.0,
+            "1500": 428612096.0,
+            "1501": 428612096.0,
+            "1502": 428612096.0,
+            "1503": 428612096.0,
+            "1504": 428612096.0,
+            "1505": 428612096.0,
+            "1506": 428612096.0,
+            "1507": 428612096.0,
+            "1508": 428612096.0,
+            "1509": 428612096.0,
+            "1510": 428612096.0,
+            "1511": 428612096.0,
+            "1512": 428612096.0,
+            "1513": 428612096.0,
+            "1514": 428612096.0,
+            "1515": 428612096.0,
+            "1516": 428612096.0,
+            "1517": 428612096.0,
+            "1518": 428612096.0,
+            "1519": 428612096.0,
+            "1520": 428612096.0,
+            "1521": 428612096.0,
+            "1522": 428612096.0,
+            "1523": 428612096.0,
+            "1524": 428612096.0,
+            "1525": 428612096.0,
+            "1526": 428612096.0,
+            "1527": 428612096.0,
+            "1528": 428612096.0,
+            "1529": 428612096.0,
+            "1530": 428612096.0,
+            "1531": 428612096.0,
+            "1532": 428612096.0,
+            "1533": 428612096.0,
+            "1534": 428612096.0,
+            "1535": 428612096.0,
+            "1536": 428612096.0,
+            "1537": 428612096.0,
+            "1538": 428612096.0,
+            "1539": 428612096.0,
+            "1540": 428612096.0,
+            "1541": 428612096.0,
+            "1542": 428612096.0,
+            "1543": 428612096.0,
+            "1544": 428612096.0,
+            "1545": 428612096.0,
+            "1546": 428612096.0,
+            "1547": 428612096.0,
+            "1548": 428612096.0,
+            "1549": 428612096.0,
+            "1550": 428612096.0,
+            "1551": 428612096.0,
+            "1552": 428612096.0,
+            "1553": 428612096.0,
+            "1554": 428612096.0,
+            "1555": 428612096.0,
+            "1556": 428612096.0,
+            "1557": 428612096.0,
+            "1558": 428612096.0,
+            "1559": 428612096.0,
+            "1560": 428612096.0,
+            "1561": 428612096.0,
+            "1562": 428612096.0,
+            "1563": 428612096.0,
+            "1564": 428612096.0,
+            "1565": 428612096.0,
+            "1566": 428612096.0,
+            "1567": 428612096.0,
+            "1568": 428612096.0,
+            "1569": 428612096.0,
+            "1570": 428612096.0,
+            "1571": 428612096.0,
+            "1572": 428612096.0,
+            "1573": 428612096.0,
+            "1574": 428612096.0,
+            "1575": 428612096.0,
+            "1576": 428612096.0,
+            "1577": 428612096.0,
+            "1578": 428612096.0,
+            "1579": 428612096.0,
+            "1580": 428612096.0,
+            "1581": 428612096.0,
+            "1582": 428612096.0,
+            "1583": 428612096.0,
+            "1584": 428612096.0,
+            "1585": 428612096.0,
+            "1586": 428612096.0,
+            "1587": 428612096.0,
+            "1588": 428612096.0,
+            "1589": 428612096.0,
+            "1590": 428612096.0,
+            "1591": 428612096.0,
+            "1592": 428612096.0,
+            "1593": 428612096.0,
+            "1594": 428612096.0,
+            "1595": 428612096.0,
+            "1596": 428612096.0,
+            "1597": 428612096.0,
+            "1598": 428612096.0,
+            "1599": 428612096.0,
+            "1600": 428612096.0,
+            "1601": 428612096.0,
+            "1602": 428612096.0,
+            "1603": 428612096.0,
+            "1604": 428612096.0,
+            "1605": 428612096.0,
+            "1606": 428612096.0,
+            "1607": 428612096.0,
+            "1608": 428612096.0,
+            "1609": 428612096.0,
+            "1610": 428612096.0,
+            "1611": 428612096.0,
+            "1612": 428612096.0,
+            "1613": 428612096.0,
+            "1614": 428612096.0,
+            "1615": 428612096.0,
+            "1616": 428612096.0,
+            "1617": 428612096.0,
+            "1618": 428612096.0,
+            "1619": 428612096.0,
+            "1620": 428612096.0,
+            "1621": 428612096.0,
+            "1622": 428612096.0,
+            "1623": 428612096.0,
+            "1624": 428612096.0,
+            "1625": 428612096.0,
+            "1626": 428612096.0,
+            "1627": 428612096.0,
+            "1628": 428612096.0,
+            "1629": 428612096.0,
+            "1630": 428612096.0,
+            "1631": 428612096.0,
+            "1632": 428612096.0,
+            "1633": 428612096.0,
+            "1634": 428612096.0,
+            "1635": 428612096.0,
+            "1636": 428612096.0,
+            "1637": 428612096.0,
+            "1638": 428612096.0,
+            "1639": 428612096.0,
+            "1640": 428612096.0,
+            "1641": 428612096.0,
+            "1642": 428612096.0,
+            "1643": 428612096.0,
+            "1644": 428612096.0,
+            "1645": 428612096.0,
+            "1646": 428612096.0,
+            "1647": 428612096.0,
+            "1648": 428612096.0,
+            "1649": 428612096.0,
+            "1650": 428612096.0,
+            "1651": 428612096.0,
+            "1652": 428612096.0,
+            "1653": 428612096.0,
+            "1654": 428612096.0,
+            "1655": 428612096.0,
+            "1656": 428612096.0,
+            "1657": 428612096.0,
+            "1658": 428612096.0,
+            "1659": 428612096.0,
+            "1660": 428612096.0,
+            "1661": 428612096.0,
+            "1662": 428612096.0,
+            "1663": 428612096.0,
+            "1664": 428612096.0,
+            "1665": 428612096.0,
+            "1666": 428612096.0,
+            "1667": 428612096.0,
+            "1668": 428612096.0,
+            "1669": 428612096.0,
+            "1670": 428612096.0,
+            "1671": 428612096.0,
+            "1672": 428612096.0,
+            "1673": 428612096.0,
+            "1674": 428612096.0,
+            "1675": 428612096.0,
+            "1676": 428612096.0,
+            "1677": 428612096.0,
+            "1678": 428612096.0,
+            "1679": 428612096.0,
+            "1680": 428612096.0,
+            "1681": 428612096.0,
+            "1682": 428612096.0,
+            "1683": 428612096.0,
+            "1684": 428612096.0,
+            "1685": 428612096.0,
+            "1686": 428612096.0,
+            "1687": 428612096.0,
+            "1688": 428612096.0,
+            "1689": 428612096.0,
+            "1690": 428612096.0,
+            "1691": 428612096.0,
+            "1692": 428612096.0,
+            "1693": 428612096.0,
+            "1694": 428612096.0,
+            "1695": 428612096.0,
+            "1696": 428612096.0,
+            "1697": 428612096.0,
+            "1698": 428612096.0,
+            "1699": 428612096.0,
+            "1700": 428612096.0,
+            "1701": 428612096.0,
+            "1702": 428612096.0,
+            "1703": 428612096.0,
+            "1704": 428612096.0,
+            "1705": 428612096.0,
+            "1706": 428612096.0,
+            "1707": 428612096.0,
+            "1708": 428612096.0,
+            "1709": 428612096.0,
+            "1710": 428612096.0,
+            "1711": 428612096.0,
+            "1712": 428612096.0,
+            "1713": 428612096.0,
+            "1714": 428612096.0,
+            "1715": 428612096.0,
+            "1716": 428612096.0,
+            "1717": 428612096.0,
+            "1718": 428612096.0,
+            "1719": 428612096.0,
+            "1720": 428612096.0,
+            "1721": 428612096.0,
+            "1722": 428612096.0,
+            "1723": 428612096.0,
+            "1724": 428612096.0,
+            "1725": 428612096.0,
+            "1726": 428612096.0,
+            "1727": 428612096.0,
+            "1728": 428612096.0,
+            "1729": 428612096.0,
+            "1730": 428612096.0,
+            "1731": 428612096.0,
+            "1732": 428612096.0,
+            "1733": 428612096.0,
+            "1734": 428612096.0,
+            "1735": 428612096.0,
+            "1736": 428612096.0,
+            "1737": 428612096.0,
+            "1738": 428612096.0,
+            "1739": 428612096.0,
+            "1740": 428612096.0,
+            "1741": 428612096.0,
+            "1742": 428612096.0,
+            "1743": 428612096.0,
+            "1744": 428612096.0,
+            "1745": 428612096.0,
+            "1746": 428612096.0,
+            "1747": 428612096.0,
+            "1748": 428612096.0,
+            "1749": 428612096.0,
+            "1750": 428612096.0,
+            "1751": 428612096.0,
+            "1752": 428612096.0,
+            "1753": 428612096.0,
+            "1754": 428612096.0,
+            "1755": 428612096.0,
+            "1756": 428612096.0,
+            "1757": 428612096.0,
+            "1758": 428612096.0,
+            "1759": 428612096.0,
+            "1760": 428612096.0,
+            "1761": 428612096.0,
+            "1762": 428612096.0,
+            "1763": 428612096.0,
+            "1764": 428612096.0,
+            "1765": 428612096.0,
+            "1766": 428612096.0,
+            "1767": 428612096.0,
+            "1768": 428612096.0,
+            "1769": 428612096.0,
+            "1770": 428612096.0,
+            "1771": 428612096.0,
+            "1772": 428612096.0,
+            "1773": 428612096.0,
+            "1774": 428612096.0,
+            "1775": 428612096.0,
+            "1776": 428612096.0,
+            "1777": 428612096.0,
+            "1778": 428612096.0,
+            "1779": 428612096.0,
+            "1780": 428612096.0,
+            "1781": 428612096.0,
+            "1782": 428612096.0,
+            "1783": 428612096.0,
+            "1784": 428612096.0,
+            "1785": 428612096.0,
+            "1786": 428612096.0,
+            "1787": 428612096.0,
+            "1788": 428612096.0,
+            "1789": 428612096.0,
+            "1790": 428612096.0,
+            "1791": 428612096.0,
+            "1792": 428612096.0,
+            "1793": 428612096.0,
+            "1794": 428612096.0,
+            "1795": 428612096.0,
+            "1796": 428612096.0,
+            "1797": 428612096.0,
+            "1798": 428612096.0,
+            "1799": 428612096.0,
+            "1800": 428612096.0,
+            "1801": 428612096.0,
+            "1802": 428612096.0,
+            "1803": 428612096.0,
+            "1804": 428612096.0,
+            "1805": 428612096.0,
+            "1806": 428612096.0,
+            "1807": 428612096.0,
+            "1808": 428612096.0,
+            "1809": 428612096.0,
+            "1810": 428612096.0,
+            "1811": 428612096.0,
+            "1812": 428612096.0,
+            "1813": 428612096.0,
+            "1814": 428612096.0,
+            "1815": 428612096.0,
+            "1816": 428612096.0,
+            "1817": 428612096.0,
+            "1818": 428612096.0,
+            "1819": 428612096.0,
+            "1820": 428612096.0,
+            "1821": 428612096.0,
+            "1822": 428612096.0,
+            "1823": 428612096.0,
+            "1824": 428612096.0,
+            "1825": 428612096.0,
+            "1826": 428612096.0,
+            "1827": 428612096.0,
+            "1828": 428612096.0,
+            "1829": 428612096.0,
+            "1830": 428612096.0,
+            "1831": 428612096.0,
+            "1832": 428612096.0,
+            "1833": 428612096.0,
+            "1834": 428612096.0,
+            "1835": 428612096.0,
+            "1836": 428612096.0,
+            "1837": 428612096.0,
+            "1838": 428612096.0,
+            "1839": 428612096.0,
+            "1840": 428612096.0,
+            "1841": 428612096.0,
+            "1842": 428612096.0,
+            "1843": 428612096.0,
+            "1844": 428612096.0,
+            "1845": 428612096.0,
+            "1846": 428612096.0,
+            "1847": 428612096.0,
+            "1848": 428612096.0,
+            "1849": 428612096.0,
+            "1850": 428612096.0,
+            "1851": 428612096.0,
+            "1852": 428612096.0,
+            "1853": 428612096.0,
+            "1854": 428612096.0,
+            "1855": 428612096.0,
+            "1856": 428612096.0,
+            "1857": 428612096.0,
+            "1858": 428612096.0,
+            "1859": 428612096.0,
+            "1860": 428612096.0,
+            "1861": 428612096.0,
+            "1862": 428612096.0,
+            "1863": 428612096.0,
+            "1864": 428612096.0,
+            "1865": 428612096.0,
+            "1866": 428612096.0,
+            "1867": 428612096.0,
+            "1868": 428612096.0,
+            "1869": 428612096.0,
+            "1870": 428612096.0,
+            "1871": 428612096.0,
+            "1872": 428612096.0,
+            "1873": 428612096.0,
+            "1874": 428612096.0,
+            "1875": 428612096.0,
+            "1876": 428612096.0,
+            "1877": 428612096.0,
+            "1878": 428612096.0,
+            "1879": 428612096.0,
+            "1880": 428612096.0,
+            "1881": 428612096.0,
+            "1882": 428612096.0,
+            "1883": 428612096.0,
+            "1884": 428612096.0,
+            "1885": 428612096.0,
+            "1886": 428612096.0,
+            "1887": 428612096.0,
+            "1888": 428612096.0,
+            "1889": 428612096.0,
+            "1890": 428612096.0,
+            "1891": 428612096.0,
+            "1892": 428612096.0,
+            "1893": 428612096.0,
+            "1894": 428612096.0,
+            "1895": 428612096.0,
+            "1896": 428612096.0,
+            "1897": 428612096.0,
+            "1898": 428612096.0,
+            "1899": 428612096.0,
+            "1900": 428612096.0,
+            "1901": 428612096.0,
+            "1902": 428612096.0,
+            "1903": 428612096.0,
+            "1904": 428612096.0,
+            "1905": 428612096.0,
+            "1906": 428612096.0,
+            "1907": 428612096.0,
+            "1908": 428612096.0,
+            "1909": 428612096.0,
+            "1910": 428612096.0,
+            "1911": 428612096.0,
+            "1912": 428612096.0,
+            "1913": 428612096.0,
+            "1914": 428612096.0,
+            "1915": 428612096.0,
+            "1916": 428612096.0,
+            "1917": 428612096.0,
+            "1918": 428612096.0,
+            "1919": 428612096.0,
+            "1920": 428612096.0,
+            "1921": 428612096.0,
+            "1922": 428612096.0,
+            "1923": 428612096.0,
+            "1924": 428612096.0,
+            "1925": 428612096.0,
+            "1926": 428612096.0,
+            "1927": 428612096.0,
+            "1928": 428612096.0,
+            "1929": 428612096.0,
+            "1930": 428612096.0,
+            "1931": 428612096.0,
+            "1932": 428612096.0,
+            "1933": 428612096.0,
+            "1934": 428612096.0,
+            "1935": 428612096.0,
+            "1936": 428612096.0,
+            "1937": 428612096.0,
+            "1938": 428612096.0,
+            "1939": 428612096.0,
+            "1940": 428612096.0,
+            "1941": 428612096.0,
+            "1942": 428612096.0,
+            "1943": 428612096.0,
+            "1944": 428612096.0,
+            "1945": 428612096.0,
+            "1946": 428612096.0,
+            "1947": 428612096.0,
+            "1948": 428612096.0,
+            "1949": 428612096.0,
+            "1950": 428612096.0,
+            "1951": 428612096.0,
+            "1952": 428612096.0,
+            "1953": 428612096.0,
+            "1954": 428612096.0,
+            "1955": 428612096.0,
+            "1956": 428612096.0,
+            "1957": 428612096.0,
+            "1958": 428612096.0,
+            "1959": 428612096.0,
+            "1960": 428612096.0,
+            "1961": 428612096.0,
+            "1962": 428612096.0,
+            "1963": 428612096.0,
+            "1964": 428612096.0,
+            "1965": 428612096.0,
+            "1966": 428612096.0,
+            "1967": 428612096.0,
+            "1968": 428612096.0,
+            "1969": 428612096.0,
+            "1970": 428612096.0,
+            "1971": 428612096.0,
+            "1972": 428612096.0,
+            "1973": 428612096.0,
+            "1974": 428612096.0,
+            "1975": 428612096.0,
+            "1976": 428612096.0,
+            "1977": 428612096.0,
+            "1978": 428612096.0,
+            "1979": 428612096.0,
+            "1980": 428612096.0,
+            "1981": 428612096.0,
+            "1982": 428612096.0,
+            "1983": 428612096.0,
+            "1984": 428612096.0,
+            "1985": 428612096.0,
+            "1986": 428612096.0,
+            "1987": 428612096.0,
+            "1988": 428612096.0,
+            "1989": 428612096.0,
+            "1990": 428612096.0,
+            "1991": 428612096.0,
+            "1992": 428612096.0,
+            "1993": 428612096.0,
+            "1994": 428612096.0,
+            "1995": 428612096.0,
+            "1996": 428612096.0,
+            "1997": 428612096.0,
+            "1998": 428612096.0,
+            "1999": 428612096.0,
+            "2000": 428612096.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 2000,
+        "step_interval": 1,
+        "values": {
+            "1": 22.43653,
+            "2": 5.05,
+            "3": 4.99632,
+            "4": 5.00941,
+            "5": 5.30047,
+            "6": 5.00529,
+            "7": 4.98693,
+            "8": 5.03236,
+            "9": 5.04733,
+            "10": 5.0355,
+            "11": 5.05504,
+            "12": 5.02789,
+            "13": 5.05026,
+            "14": 5.03817,
+            "15": 5.03065,
+            "16": 5.04414,
+            "17": 5.00251,
+            "18": 4.9928,
+            "19": 4.99792,
+            "20": 4.99648,
+            "21": 5.01668,
+            "22": 4.97973,
+            "23": 5.06379,
+            "24": 5.01631,
+            "25": 4.96187,
+            "26": 4.95004,
+            "27": 4.95649,
+            "28": 4.93702,
+            "29": 4.93675,
+            "30": 4.92101,
+            "31": 4.93325,
+            "32": 4.92626,
+            "33": 4.93256,
+            "34": 4.93518,
+            "35": 4.95011,
+            "36": 4.959,
+            "37": 5.41549,
+            "38": 5.7108,
+            "39": 4.96475,
+            "40": 4.95756,
+            "41": 5.03533,
+            "42": 4.94591,
+            "43": 5.30856,
+            "44": 4.93166,
+            "45": 5.29533,
+            "46": 6.02838,
+            "47": 4.99271,
+            "48": 4.93548,
+            "49": 4.93262,
+            "50": 4.93589,
+            "51": 4.93457,
+            "52": 4.9402,
+            "53": 4.93593,
+            "54": 4.93266,
+            "55": 4.93457,
+            "56": 4.926,
+            "57": 4.94015,
+            "58": 4.93606,
+            "59": 4.92819,
+            "60": 4.92679,
+            "61": 4.92853,
+            "62": 4.93744,
+            "63": 4.93014,
+            "64": 4.92895,
+            "65": 4.92774,
+            "66": 4.9263,
+            "67": 4.92483,
+            "68": 4.91654,
+            "69": 4.95386,
+            "70": 4.95969,
+            "71": 4.97371,
+            "72": 4.96736,
+            "73": 4.98575,
+            "74": 4.968,
+            "75": 5.68071,
+            "76": 4.98487,
+            "77": 4.98651,
+            "78": 4.97441,
+            "79": 4.97854,
+            "80": 4.97886,
+            "81": 4.98163,
+            "82": 4.97647,
+            "83": 5.33849,
+            "84": 4.98394,
+            "85": 4.98,
+            "86": 4.96888,
+            "87": 4.9685,
+            "88": 5.33167,
+            "89": 5.40565,
+            "90": 4.97724,
+            "91": 6.05451,
+            "92": 4.9699,
+            "93": 4.96947,
+            "94": 4.97853,
+            "95": 5.03234,
+            "96": 4.9703,
+            "97": 4.9766,
+            "98": 4.96386,
+            "99": 4.97968,
+            "100": 4.96583,
+            "101": 4.956,
+            "102": 4.94425,
+            "103": 4.96789,
+            "104": 4.96252,
+            "105": 4.97853,
+            "106": 4.98313,
+            "107": 4.98,
+            "108": 4.97528,
+            "109": 4.98226,
+            "110": 4.98532,
+            "111": 4.95791,
+            "112": 4.95409,
+            "113": 5.66529,
+            "114": 4.96347,
+            "115": 4.99625,
+            "116": 4.99199,
+            "117": 4.98823,
+            "118": 4.98114,
+            "119": 4.97652,
+            "120": 4.98449,
+            "121": 4.98578,
+            "122": 4.98423,
+            "123": 4.9824,
+            "124": 4.98111,
+            "125": 4.98291,
+            "126": 4.98215,
+            "127": 4.98484,
+            "128": 5.35151,
+            "129": 4.9912,
+            "130": 4.99188,
+            "131": 4.98662,
+            "132": 5.34041,
+            "133": 4.98063,
+            "134": 5.33235,
+            "135": 5.69907,
+            "136": 5.33587,
+            "137": 4.98509,
+            "138": 5.61624,
+            "139": 5.23864,
+            "140": 4.92839,
+            "141": 4.95868,
+            "142": 4.93611,
+            "143": 4.9473,
+            "144": 4.9282,
+            "145": 4.93563,
+            "146": 4.92822,
+            "147": 4.94205,
+            "148": 4.94037,
+            "149": 4.93429,
+            "150": 5.62642,
+            "151": 4.93794,
+            "152": 4.9323,
+            "153": 4.93391,
+            "154": 4.93581,
+            "155": 4.93177,
+            "156": 4.93719,
+            "157": 4.93775,
+            "158": 4.93223,
+            "159": 4.9449,
+            "160": 4.93898,
+            "161": 4.94198,
+            "162": 4.9436,
+            "163": 4.9355,
+            "164": 4.93432,
+            "165": 4.9382,
+            "166": 4.94332,
+            "167": 4.93425,
+            "168": 4.93189,
+            "169": 4.92717,
+            "170": 4.94393,
+            "171": 4.94517,
+            "172": 4.92976,
+            "173": 5.303,
+            "174": 4.92818,
+            "175": 4.92924,
+            "176": 4.9385,
+            "177": 5.27801,
+            "178": 4.93182,
+            "179": 5.28092,
+            "180": 5.99722,
+            "181": 4.92656,
+            "182": 4.92594,
+            "183": 4.92947,
+            "184": 4.93087,
+            "185": 4.92967,
+            "186": 4.93088,
+            "187": 5.62908,
+            "188": 4.93498,
+            "189": 4.9476,
+            "190": 4.93843,
+            "191": 4.94101,
+            "192": 4.93265,
+            "193": 4.93046,
+            "194": 4.93133,
+            "195": 4.94044,
+            "196": 4.93997,
+            "197": 4.93336,
+            "198": 6.32096,
+            "199": 4.95042,
+            "200": 4.91888,
+            "201": 4.91803,
+            "202": 4.92212,
+            "203": 4.91738,
+            "204": 4.93431,
+            "205": 4.93078,
+            "206": 4.9288,
+            "207": 4.9431,
+            "208": 4.93288,
+            "209": 4.93152,
+            "210": 4.92297,
+            "211": 4.92152,
+            "212": 4.92078,
+            "213": 4.93382,
+            "214": 4.92203,
+            "215": 4.92628,
+            "216": 4.92759,
+            "217": 4.91972,
+            "218": 4.93018,
+            "219": 5.30587,
+            "220": 4.92639,
+            "221": 4.92815,
+            "222": 5.28345,
+            "223": 4.93513,
+            "224": 5.62954,
+            "225": 6.35198,
+            "226": 4.94108,
+            "227": 4.94033,
+            "228": 4.94077,
+            "229": 4.9445,
+            "230": 4.95277,
+            "231": 4.93684,
+            "232": 4.94258,
+            "233": 4.9386,
+            "234": 4.94149,
+            "235": 4.94872,
+            "236": 4.95361,
+            "237": 4.94924,
+            "238": 4.93722,
+            "239": 4.94342,
+            "240": 4.95029,
+            "241": 4.94512,
+            "242": 4.9423,
+            "243": 4.93861,
+            "244": 4.93578,
+            "245": 4.93502,
+            "246": 4.94519,
+            "247": 4.93658,
+            "248": 4.93761,
+            "249": 4.94583,
+            "250": 4.94414,
+            "251": 4.94331,
+            "252": 4.94044,
+            "253": 4.94317,
+            "254": 4.94161,
+            "255": 4.95295,
+            "256": 4.95044,
+            "257": 4.94816,
+            "258": 4.94006,
+            "259": 4.94409,
+            "260": 4.9408,
+            "261": 4.94791,
+            "262": 5.63079,
+            "263": 4.95361,
+            "264": 5.3219,
+            "265": 4.96046,
+            "266": 4.95564,
+            "267": 5.30372,
+            "268": 5.30618,
+            "269": 4.94954,
+            "270": 6.01622,
+            "271": 4.9509,
+            "272": 4.9579,
+            "273": 4.9529,
+            "274": 4.95339,
+            "275": 4.94721,
+            "276": 4.95053,
+            "277": 4.9434,
+            "278": 4.9389,
+            "279": 4.94021,
+            "280": 4.93862,
+            "281": 4.93834,
+            "282": 4.93985,
+            "283": 4.94183,
+            "284": 4.93716,
+            "285": 4.9443,
+            "286": 4.94305,
+            "287": 4.93467,
+            "288": 4.93816,
+            "289": 4.93749,
+            "290": 4.9349,
+            "291": 4.939,
+            "292": 4.93482,
+            "293": 4.94665,
+            "294": 4.93648,
+            "295": 4.93823,
+            "296": 4.93522,
+            "297": 4.93472,
+            "298": 4.93288,
+            "299": 5.61551,
+            "300": 4.95418,
+            "301": 4.95347,
+            "302": 4.95005,
+            "303": 4.95224,
+            "304": 5.01672,
+            "305": 4.94451,
+            "306": 4.9469,
+            "307": 4.94674,
+            "308": 4.95506,
+            "309": 5.3147,
+            "310": 4.97913,
+            "311": 5.29357,
+            "312": 4.94239,
+            "313": 5.28356,
+            "314": 5.66502,
+            "315": 5.29945,
+            "316": 4.94213,
+            "317": 4.93439,
+            "318": 4.94085,
+            "319": 4.93452,
+            "320": 4.94083,
+            "321": 4.93407,
+            "322": 4.93596,
+            "323": 4.9411,
+            "324": 4.94091,
+            "325": 4.93723,
+            "326": 4.93682,
+            "327": 4.93712,
+            "328": 4.99643,
+            "329": 4.94011,
+            "330": 4.93777,
+            "331": 4.93553,
+            "332": 4.938,
+            "333": 4.94101,
+            "334": 4.93199,
+            "335": 4.93179,
+            "336": 5.28612,
+            "337": 5.30266,
+            "338": 4.96477,
+            "339": 4.97585,
+            "340": 4.95959,
+            "341": 4.95912,
+            "342": 4.96594,
+            "343": 4.96105,
+            "344": 4.96501,
+            "345": 4.96175,
+            "346": 4.96452,
+            "347": 4.9603,
+            "348": 4.95434,
+            "349": 4.95658,
+            "350": 4.95773,
+            "351": 4.96723,
+            "352": 5.02353,
+            "353": 4.95487,
+            "354": 5.32227,
+            "355": 4.95601,
+            "356": 5.29598,
+            "357": 4.95819,
+            "358": 5.29935,
+            "359": 6.01593,
+            "360": 4.96832,
+            "361": 4.95302,
+            "362": 4.95944,
+            "363": 4.95167,
+            "364": 4.9483,
+            "365": 4.94951,
+            "366": 4.9525,
+            "367": 4.95364,
+            "368": 4.94948,
+            "369": 4.95258,
+            "370": 4.94974,
+            "371": 4.96357,
+            "372": 4.94701,
+            "373": 4.94584,
+            "374": 5.27688,
+            "375": 5.29329,
+            "376": 4.93553,
+            "377": 4.93296,
+            "378": 4.93431,
+            "379": 4.94158,
+            "380": 4.98441,
+            "381": 4.99657,
+            "382": 4.97634,
+            "383": 4.98015,
+            "384": 4.98178,
+            "385": 4.97595,
+            "386": 4.97431,
+            "387": 4.97965,
+            "388": 4.91884,
+            "389": 4.92436,
+            "390": 4.9179,
+            "391": 4.91999,
+            "392": 4.92113,
+            "393": 4.92231,
+            "394": 4.91815,
+            "395": 4.92381,
+            "396": 4.91848,
+            "397": 4.92412,
+            "398": 4.91541,
+            "399": 4.91455,
+            "400": 5.29982,
+            "401": 5.26416,
+            "402": 5.2612,
+            "403": 4.91795,
+            "404": 5.63316,
+            "405": 5.27153,
+            "406": 4.90744,
+            "407": 4.9142,
+            "408": 4.90831,
+            "409": 4.90838,
+            "410": 4.92063,
+            "411": 5.25377,
+            "412": 5.26322,
+            "413": 4.91895,
+            "414": 4.92378,
+            "415": 4.91866,
+            "416": 4.91955,
+            "417": 4.92152,
+            "418": 4.91929,
+            "419": 4.9201,
+            "420": 4.91526,
+            "421": 4.91974,
+            "422": 4.92503,
+            "423": 4.92579,
+            "424": 4.91791,
+            "425": 4.92253,
+            "426": 4.92114,
+            "427": 4.91774,
+            "428": 4.91171,
+            "429": 4.9125,
+            "430": 4.91411,
+            "431": 4.90802,
+            "432": 4.9164,
+            "433": 4.90723,
+            "434": 4.92382,
+            "435": 4.9069,
+            "436": 4.91154,
+            "437": 4.90512,
+            "438": 4.9175,
+            "439": 4.91782,
+            "440": 4.91028,
+            "441": 4.91048,
+            "442": 4.90894,
+            "443": 4.88817,
+            "444": 4.88126,
+            "445": 5.24853,
+            "446": 4.87836,
+            "447": 5.24263,
+            "448": 5.25398,
+            "449": 6.28763,
+            "450": 4.88338,
+            "451": 4.89491,
+            "452": 4.88709,
+            "453": 4.89008,
+            "454": 4.90322,
+            "455": 4.90113,
+            "456": 4.90439,
+            "457": 4.90223,
+            "458": 4.90641,
+            "459": 4.90851,
+            "460": 4.9009,
+            "461": 4.89968,
+            "462": 4.89662,
+            "463": 4.9081,
+            "464": 4.88866,
+            "465": 4.90253,
+            "466": 4.90724,
+            "467": 4.89875,
+            "468": 4.90067,
+            "469": 4.90495,
+            "470": 4.89887,
+            "471": 4.89965,
+            "472": 4.90145,
+            "473": 4.88549,
+            "474": 4.87833,
+            "475": 4.88274,
+            "476": 4.87937,
+            "477": 4.88019,
+            "478": 4.87808,
+            "479": 4.88269,
+            "480": 4.87591,
+            "481": 4.88072,
+            "482": 4.87452,
+            "483": 4.8839,
+            "484": 4.87834,
+            "485": 5.21963,
+            "486": 4.8887,
+            "487": 5.22473,
+            "488": 4.88748,
+            "489": 4.89663,
+            "490": 5.6108,
+            "491": 5.24875,
+            "492": 4.88583,
+            "493": 5.24488,
+            "494": 5.59516,
+            "495": 4.89058,
+            "496": 4.91601,
+            "497": 4.88752,
+            "498": 4.88645,
+            "499": 4.89008,
+            "500": 4.89271,
+            "501": 4.8913,
+            "502": 4.89039,
+            "503": 4.8906,
+            "504": 4.88603,
+            "505": 4.92691,
+            "506": 4.91793,
+            "507": 4.92158,
+            "508": 4.91981,
+            "509": 4.92795,
+            "510": 4.91413,
+            "511": 4.91073,
+            "512": 4.90909,
+            "513": 4.91434,
+            "514": 4.91509,
+            "515": 4.91002,
+            "516": 4.9115,
+            "517": 4.91722,
+            "518": 4.91514,
+            "519": 4.91283,
+            "520": 4.91403,
+            "521": 4.91077,
+            "522": 4.91167,
+            "523": 5.26088,
+            "524": 5.27803,
+            "525": 4.92516,
+            "526": 4.93143,
+            "527": 4.9217,
+            "528": 4.92344,
+            "529": 4.91786,
+            "530": 4.9193,
+            "531": 4.881,
+            "532": 4.87697,
+            "533": 4.88329,
+            "534": 5.23628,
+            "535": 5.26149,
+            "536": 4.88132,
+            "537": 5.23366,
+            "538": 5.92272,
+            "539": 4.8822,
+            "540": 4.87645,
+            "541": 4.87941,
+            "542": 4.8726,
+            "543": 4.87977,
+            "544": 4.88572,
+            "545": 4.97915,
+            "546": 4.94014,
+            "547": 4.9447,
+            "548": 4.94585,
+            "549": 4.93712,
+            "550": 4.95428,
+            "551": 4.9405,
+            "552": 4.94013,
+            "553": 4.94514,
+            "554": 4.94542,
+            "555": 4.94729,
+            "556": 4.93818,
+            "557": 4.94632,
+            "558": 4.95928,
+            "559": 4.94439,
+            "560": 5.29538,
+            "561": 5.29912,
+            "562": 4.95591,
+            "563": 4.94545,
+            "564": 4.9589,
+            "565": 4.9486,
+            "566": 4.94487,
+            "567": 4.94563,
+            "568": 4.96795,
+            "569": 4.96332,
+            "570": 4.95731,
+            "571": 4.95751,
+            "572": 4.94401,
+            "573": 4.94623,
+            "574": 4.9438,
+            "575": 4.9342,
+            "576": 4.93847,
+            "577": 4.94215,
+            "578": 4.94036,
+            "579": 4.95135,
+            "580": 5.28996,
+            "581": 5.66625,
+            "582": 4.93892,
+            "583": 5.64719,
+            "584": 5.28091,
+            "585": 4.95827,
+            "586": 4.95725,
+            "587": 4.96107,
+            "588": 4.95092,
+            "589": 4.95514,
+            "590": 4.94845,
+            "591": 4.94342,
+            "592": 4.9488,
+            "593": 4.93576,
+            "594": 4.93657,
+            "595": 4.93545,
+            "596": 4.93595,
+            "597": 5.29319,
+            "598": 5.28921,
+            "599": 4.95347,
+            "600": 4.94896,
+            "601": 4.94543,
+            "602": 4.95405,
+            "603": 4.94996,
+            "604": 4.94726,
+            "605": 4.94394,
+            "606": 4.9443,
+            "607": 4.99448,
+            "608": 4.93032,
+            "609": 4.96191,
+            "610": 4.95086,
+            "611": 4.94486,
+            "612": 4.94403,
+            "613": 4.94194,
+            "614": 4.94624,
+            "615": 4.94461,
+            "616": 4.96458,
+            "617": 4.94658,
+            "618": 4.94254,
+            "619": 4.93901,
+            "620": 4.94138,
+            "621": 4.94747,
+            "622": 4.95796,
+            "623": 4.94579,
+            "624": 5.30372,
+            "625": 4.94082,
+            "626": 5.66834,
+            "627": 4.93994,
+            "628": 5.97473,
+            "629": 4.94152,
+            "630": 4.94328,
+            "631": 4.9385,
+            "632": 4.9688,
+            "633": 4.93837,
+            "634": 5.25732,
+            "635": 4.9147,
+            "636": 5.25839,
+            "637": 4.92259,
+            "638": 4.91081,
+            "639": 4.92229,
+            "640": 4.92687,
+            "641": 4.91335,
+            "642": 4.91557,
+            "643": 4.91922,
+            "644": 4.91847,
+            "645": 4.92121,
+            "646": 4.92251,
+            "647": 4.91255,
+            "648": 4.91291,
+            "649": 4.91003,
+            "650": 4.90867,
+            "651": 4.91235,
+            "652": 4.90719,
+            "653": 4.90865,
+            "654": 4.90719,
+            "655": 4.91306,
+            "656": 4.90861,
+            "657": 4.90901,
+            "658": 4.91095,
+            "659": 4.90726,
+            "660": 4.90915,
+            "661": 4.91011,
+            "662": 4.90721,
+            "663": 4.90907,
+            "664": 4.91699,
+            "665": 4.91095,
+            "666": 4.90826,
+            "667": 4.90687,
+            "668": 4.90738,
+            "669": 5.25716,
+            "670": 5.25453,
+            "671": 5.28603,
+            "672": 5.25386,
+            "673": 6.29304,
+            "674": 4.91719,
+            "675": 4.9174,
+            "676": 4.92014,
+            "677": 4.92048,
+            "678": 4.90878,
+            "679": 4.90967,
+            "680": 4.90981,
+            "681": 4.91054,
+            "682": 4.90885,
+            "683": 4.90932,
+            "684": 4.915,
+            "685": 4.90701,
+            "686": 4.91124,
+            "687": 4.91733,
+            "688": 4.91577,
+            "689": 4.91189,
+            "690": 4.90854,
+            "691": 4.90631,
+            "692": 4.90689,
+            "693": 4.9142,
+            "694": 4.90933,
+            "695": 4.90064,
+            "696": 4.88962,
+            "697": 4.89317,
+            "698": 4.89665,
+            "699": 4.90473,
+            "700": 4.90675,
+            "701": 4.90072,
+            "702": 4.90347,
+            "703": 4.90535,
+            "704": 4.90243,
+            "705": 4.90653,
+            "706": 4.90494,
+            "707": 4.90715,
+            "708": 4.89971,
+            "709": 5.25068,
+            "710": 5.24447,
+            "711": 4.91173,
+            "712": 4.91607,
+            "713": 5.26011,
+            "714": 4.90966,
+            "715": 4.90512,
+            "716": 5.63181,
+            "717": 5.62011,
+            "718": 5.23301,
+            "719": 4.91317,
+            "720": 4.90779,
+            "721": 4.90675,
+            "722": 4.90612,
+            "723": 4.90554,
+            "724": 4.90952,
+            "725": 4.90669,
+            "726": 4.90589,
+            "727": 4.9062,
+            "728": 4.91028,
+            "729": 4.905,
+            "730": 4.90848,
+            "731": 4.90621,
+            "732": 4.91216,
+            "733": 4.90248,
+            "734": 4.90051,
+            "735": 4.90319,
+            "736": 4.90401,
+            "737": 4.90646,
+            "738": 4.90558,
+            "739": 4.90438,
+            "740": 4.90694,
+            "741": 4.9036,
+            "742": 4.90521,
+            "743": 4.90326,
+            "744": 4.90534,
+            "745": 4.90658,
+            "746": 5.24876,
+            "747": 4.91293,
+            "748": 5.24944,
+            "749": 4.90712,
+            "750": 4.90572,
+            "751": 4.90977,
+            "752": 4.90683,
+            "753": 4.90815,
+            "754": 4.90611,
+            "755": 4.91427,
+            "756": 4.9129,
+            "757": 4.91264,
+            "758": 5.25755,
+            "759": 4.91199,
+            "760": 5.2647,
+            "761": 4.91559,
+            "762": 5.64712,
+            "763": 5.59149,
+            "764": 4.91566,
+            "765": 4.91348,
+            "766": 4.92052,
+            "767": 4.9149,
+            "768": 4.91624,
+            "769": 4.90919,
+            "770": 4.9208,
+            "771": 4.9111,
+            "772": 4.91242,
+            "773": 4.91183,
+            "774": 4.91856,
+            "775": 4.91524,
+            "776": 4.91642,
+            "777": 4.91271,
+            "778": 4.91587,
+            "779": 4.91173,
+            "780": 4.9163,
+            "781": 4.9101,
+            "782": 4.90927,
+            "783": 4.91594,
+            "784": 5.27562,
+            "785": 5.29399,
+            "786": 4.92064,
+            "787": 4.92508,
+            "788": 4.91936,
+            "789": 4.92025,
+            "790": 4.92839,
+            "791": 4.91829,
+            "792": 4.9234,
+            "793": 4.92615,
+            "794": 4.91968,
+            "795": 4.91417,
+            "796": 4.89214,
+            "797": 4.87642,
+            "798": 4.87726,
+            "799": 4.88691,
+            "800": 4.87753,
+            "801": 4.90361,
+            "802": 4.91538,
+            "803": 5.25822,
+            "804": 5.25769,
+            "805": 4.90985,
+            "806": 4.91228,
+            "807": 5.6423,
+            "808": 5.23836,
+            "809": 4.9314,
+            "810": 4.91226,
+            "811": 4.91382,
+            "812": 4.91588,
+            "813": 4.91005,
+            "814": 4.9202,
+            "815": 4.90766,
+            "816": 4.90744,
+            "817": 4.91497,
+            "818": 4.91,
+            "819": 4.90572,
+            "820": 4.91342,
+            "821": 5.26215,
+            "822": 5.25971,
+            "823": 4.92486,
+            "824": 4.92645,
+            "825": 4.91518,
+            "826": 4.91893,
+            "827": 4.90862,
+            "828": 4.9143,
+            "829": 4.91422,
+            "830": 4.91829,
+            "831": 4.90569,
+            "832": 4.91122,
+            "833": 4.90584,
+            "834": 4.90518,
+            "835": 4.90755,
+            "836": 4.90656,
+            "837": 4.90626,
+            "838": 4.90987,
+            "839": 4.91189,
+            "840": 4.90735,
+            "841": 4.90697,
+            "842": 4.91064,
+            "843": 4.90409,
+            "844": 4.90711,
+            "845": 4.90385,
+            "846": 4.90599,
+            "847": 5.24636,
+            "848": 4.89752,
+            "849": 5.24655,
+            "850": 4.90148,
+            "851": 4.89501,
+            "852": 5.98483,
+            "853": 4.89468,
+            "854": 4.89653,
+            "855": 4.8954,
+            "856": 4.89811,
+            "857": 4.90026,
+            "858": 5.24069,
+            "859": 4.91345,
+            "860": 5.2538,
+            "861": 4.91107,
+            "862": 4.90905,
+            "863": 4.90289,
+            "864": 4.90179,
+            "865": 4.90697,
+            "866": 4.89969,
+            "867": 4.89622,
+            "868": 4.89817,
+            "869": 4.89734,
+            "870": 4.89421,
+            "871": 4.902,
+            "872": 4.89737,
+            "873": 4.90082,
+            "874": 4.8986,
+            "875": 4.9034,
+            "876": 4.90213,
+            "877": 4.89969,
+            "878": 4.90652,
+            "879": 4.90216,
+            "880": 4.90541,
+            "881": 4.90491,
+            "882": 4.89798,
+            "883": 4.89325,
+            "884": 4.89662,
+            "885": 4.91,
+            "886": 4.89481,
+            "887": 4.90025,
+            "888": 4.89887,
+            "889": 4.89458,
+            "890": 4.89351,
+            "891": 4.89343,
+            "892": 5.24625,
+            "893": 4.90075,
+            "894": 5.24719,
+            "895": 4.89439,
+            "896": 5.95508,
+            "897": 5.92842,
+            "898": 4.90126,
+            "899": 4.91443,
+            "900": 4.90222,
+            "901": 4.89928,
+            "902": 4.89952,
+            "903": 4.89905,
+            "904": 4.90536,
+            "905": 4.90627,
+            "906": 4.90188,
+            "907": 4.90671,
+            "908": 4.90531,
+            "909": 4.90614,
+            "910": 4.90319,
+            "911": 4.90668,
+            "912": 4.90614,
+            "913": 4.90641,
+            "914": 4.90219,
+            "915": 4.89858,
+            "916": 4.89788,
+            "917": 4.90114,
+            "918": 4.89062,
+            "919": 4.89675,
+            "920": 4.89412,
+            "921": 4.89851,
+            "922": 4.90258,
+            "923": 4.89837,
+            "924": 4.89168,
+            "925": 4.90558,
+            "926": 4.88926,
+            "927": 4.89631,
+            "928": 4.89481,
+            "929": 4.89896,
+            "930": 4.90349,
+            "931": 4.90254,
+            "932": 4.89424,
+            "933": 5.2393,
+            "934": 4.90447,
+            "935": 5.24957,
+            "936": 4.89799,
+            "937": 5.24757,
+            "938": 4.90497,
+            "939": 5.26023,
+            "940": 4.905,
+            "941": 4.90603,
+            "942": 5.89013,
+            "943": 5.2754,
+            "944": 4.89903,
+            "945": 4.90825,
+            "946": 4.90072,
+            "947": 4.91095,
+            "948": 4.89642,
+            "949": 4.90314,
+            "950": 4.9027,
+            "951": 4.90276,
+            "952": 4.90005,
+            "953": 4.90591,
+            "954": 4.89179,
+            "955": 4.89648,
+            "956": 4.89739,
+            "957": 4.90258,
+            "958": 4.90027,
+            "959": 4.90627,
+            "960": 4.89592,
+            "961": 4.89153,
+            "962": 4.89826,
+            "963": 4.89281,
+            "964": 4.88656,
+            "965": 4.9056,
+            "966": 4.88948,
+            "967": 4.89075,
+            "968": 4.89128,
+            "969": 4.88907,
+            "970": 5.23384,
+            "971": 4.91197,
+            "972": 5.24458,
+            "973": 4.90766,
+            "974": 4.90557,
+            "975": 4.9059,
+            "976": 4.90502,
+            "977": 4.90392,
+            "978": 4.90541,
+            "979": 4.89927,
+            "980": 4.9047,
+            "981": 4.90276,
+            "982": 5.2516,
+            "983": 5.25121,
+            "984": 4.90232,
+            "985": 4.90209,
+            "986": 5.26939,
+            "987": 5.52932,
+            "988": 5.28293,
+            "989": 4.91742,
+            "990": 4.90637,
+            "991": 4.90953,
+            "992": 4.90864,
+            "993": 4.9075,
+            "994": 4.90696,
+            "995": 4.90473,
+            "996": 4.90192,
+            "997": 4.90199,
+            "998": 4.89181,
+            "999": 4.89111,
+            "1000": 4.89025,
+            "1001": 4.9168,
+            "1002": 4.90983,
+            "1003": 4.91875,
+            "1004": 4.90892,
+            "1005": 4.92588,
+            "1006": 4.91678,
+            "1007": 5.262,
+            "1008": 4.92447,
+            "1009": 5.26729,
+            "1010": 4.92803,
+            "1011": 4.92461,
+            "1012": 4.92338,
+            "1013": 4.9218,
+            "1014": 4.92051,
+            "1015": 4.92442,
+            "1016": 4.91248,
+            "1017": 4.92113,
+            "1018": 4.92046,
+            "1019": 4.91949,
+            "1020": 4.92623,
+            "1021": 4.92267,
+            "1022": 4.92249,
+            "1023": 4.91899,
+            "1024": 4.92062,
+            "1025": 5.26804,
+            "1026": 4.92131,
+            "1027": 5.26954,
+            "1028": 4.91856,
+            "1029": 4.91681,
+            "1030": 5.90813,
+            "1031": 4.92456,
+            "1032": 4.92325,
+            "1033": 5.3083,
+            "1034": 4.91916,
+            "1035": 4.91422,
+            "1036": 4.91293,
+            "1037": 4.91223,
+            "1038": 4.9211,
+            "1039": 4.92393,
+            "1040": 4.92009,
+            "1041": 4.92106,
+            "1042": 4.9242,
+            "1043": 4.92005,
+            "1044": 5.26878,
+            "1045": 4.92668,
+            "1046": 4.93095,
+            "1047": 5.27312,
+            "1048": 4.92622,
+            "1049": 4.92229,
+            "1050": 4.92078,
+            "1051": 4.9252,
+            "1052": 4.92398,
+            "1053": 4.92467,
+            "1054": 4.92254,
+            "1055": 4.92721,
+            "1056": 4.92594,
+            "1057": 4.93074,
+            "1058": 4.9202,
+            "1059": 4.92339,
+            "1060": 4.92936,
+            "1061": 4.92316,
+            "1062": 4.91832,
+            "1063": 4.9324,
+            "1064": 4.96238,
+            "1065": 4.94321,
+            "1066": 4.96241,
+            "1067": 4.93128,
+            "1068": 4.92665,
+            "1069": 4.93217,
+            "1070": 5.29473,
+            "1071": 5.27044,
+            "1072": 4.91774,
+            "1073": 4.92979,
+            "1074": 5.30092,
+            "1075": 5.57166,
+            "1076": 4.9336,
+            "1077": 4.91975,
+            "1078": 5.29838,
+            "1079": 4.92345,
+            "1080": 4.92265,
+            "1081": 4.93832,
+            "1082": 5.28966,
+            "1083": 4.94183,
+            "1084": 5.28091,
+            "1085": 4.94506,
+            "1086": 4.94668,
+            "1087": 4.94028,
+            "1088": 4.93858,
+            "1089": 4.93937,
+            "1090": 4.9454,
+            "1091": 4.95599,
+            "1092": 4.95023,
+            "1093": 4.94499,
+            "1094": 4.96028,
+            "1095": 4.95213,
+            "1096": 4.96406,
+            "1097": 4.93905,
+            "1098": 4.92198,
+            "1099": 4.93824,
+            "1100": 4.92789,
+            "1101": 4.92981,
+            "1102": 4.93937,
+            "1103": 4.91985,
+            "1104": 4.91889,
+            "1105": 4.93785,
+            "1106": 4.94007,
+            "1107": 4.93618,
+            "1108": 4.94002,
+            "1109": 4.96964,
+            "1110": 4.93965,
+            "1111": 4.89692,
+            "1112": 4.89611,
+            "1113": 4.89245,
+            "1114": 5.24194,
+            "1115": 4.89604,
+            "1116": 5.23738,
+            "1117": 4.89591,
+            "1118": 4.89712,
+            "1119": 6.2207,
+            "1120": 4.89707,
+            "1121": 5.24025,
+            "1122": 4.89987,
+            "1123": 5.27914,
+            "1124": 4.9043,
+            "1125": 4.89477,
+            "1126": 4.89625,
+            "1127": 4.90132,
+            "1128": 4.90216,
+            "1129": 4.90398,
+            "1130": 4.89594,
+            "1131": 4.90153,
+            "1132": 4.89796,
+            "1133": 4.89536,
+            "1134": 4.89807,
+            "1135": 4.89858,
+            "1136": 4.89867,
+            "1137": 4.89681,
+            "1138": 4.92931,
+            "1139": 4.92599,
+            "1140": 4.89538,
+            "1141": 4.89732,
+            "1142": 4.89242,
+            "1143": 4.89262,
+            "1144": 4.89274,
+            "1145": 4.93085,
+            "1146": 4.9294,
+            "1147": 4.92891,
+            "1148": 4.91881,
+            "1149": 4.89129,
+            "1150": 4.89171,
+            "1151": 4.8862,
+            "1152": 4.89315,
+            "1153": 4.89463,
+            "1154": 4.89481,
+            "1155": 4.89194,
+            "1156": 5.23303,
+            "1157": 4.89025,
+            "1158": 4.89312,
+            "1159": 5.24533,
+            "1160": 5.25573,
+            "1161": 5.23949,
+            "1162": 4.8914,
+            "1163": 4.89247,
+            "1164": 4.8896,
+            "1165": 5.88618,
+            "1166": 4.91824,
+            "1167": 4.89232,
+            "1168": 5.27914,
+            "1169": 4.88638,
+            "1170": 4.89624,
+            "1171": 4.90097,
+            "1172": 4.89335,
+            "1173": 4.90022,
+            "1174": 4.88823,
+            "1175": 4.91533,
+            "1176": 4.91702,
+            "1177": 4.91026,
+            "1178": 4.89204,
+            "1179": 4.89341,
+            "1180": 4.88754,
+            "1181": 4.89101,
+            "1182": 4.89528,
+            "1183": 4.89482,
+            "1184": 4.88208,
+            "1185": 4.87829,
+            "1186": 4.88501,
+            "1187": 4.88593,
+            "1188": 4.87526,
+            "1189": 4.88604,
+            "1190": 4.90872,
+            "1191": 4.88218,
+            "1192": 4.8826,
+            "1193": 4.88606,
+            "1194": 5.22378,
+            "1195": 4.88192,
+            "1196": 4.8877,
+            "1197": 5.23842,
+            "1198": 4.89888,
+            "1199": 4.89039,
+            "1200": 4.89543,
+            "1201": 4.8917,
+            "1202": 4.88928,
+            "1203": 4.88428,
+            "1204": 4.91394,
+            "1205": 5.27535,
+            "1206": 5.27273,
+            "1207": 4.92919,
+            "1208": 4.92498,
+            "1209": 5.60645,
+            "1210": 5.23108,
+            "1211": 4.91823,
+            "1212": 4.91107,
+            "1213": 4.90706,
+            "1214": 5.33395,
+            "1215": 4.91341,
+            "1216": 4.92296,
+            "1217": 4.92797,
+            "1218": 4.91436,
+            "1219": 4.93183,
+            "1220": 4.92763,
+            "1221": 4.91189,
+            "1222": 4.91524,
+            "1223": 4.92927,
+            "1224": 4.90762,
+            "1225": 4.91646,
+            "1226": 4.95199,
+            "1227": 4.93657,
+            "1228": 4.91049,
+            "1229": 4.90576,
+            "1230": 4.92418,
+            "1231": 5.24788,
+            "1232": 4.90922,
+            "1233": 4.90828,
+            "1234": 5.28741,
+            "1235": 4.93359,
+            "1236": 4.92651,
+            "1237": 4.92759,
+            "1238": 4.91812,
+            "1239": 4.96161,
+            "1240": 4.92462,
+            "1241": 4.9408,
+            "1242": 4.95151,
+            "1243": 4.92866,
+            "1244": 4.94942,
+            "1245": 4.93202,
+            "1246": 4.93118,
+            "1247": 4.92787,
+            "1248": 4.93195,
+            "1249": 5.31148,
+            "1250": 4.96525,
+            "1251": 5.27677,
+            "1252": 4.95992,
+            "1253": 4.89092,
+            "1254": 5.87598,
+            "1255": 4.89013,
+            "1256": 4.89328,
+            "1257": 4.88679,
+            "1258": 4.89107,
+            "1259": 5.26785,
+            "1260": 4.89071,
+            "1261": 4.89005,
+            "1262": 4.89216,
+            "1263": 4.89212,
+            "1264": 4.88574,
+            "1265": 4.88902,
+            "1266": 4.88642,
+            "1267": 4.89574,
+            "1268": 4.88631,
+            "1269": 5.22724,
+            "1270": 4.88943,
+            "1271": 5.23761,
+            "1272": 4.90353,
+            "1273": 4.89726,
+            "1274": 4.92161,
+            "1275": 4.92347,
+            "1276": 4.91698,
+            "1277": 4.92233,
+            "1278": 4.91979,
+            "1279": 4.9211,
+            "1280": 4.9179,
+            "1281": 4.92209,
+            "1282": 4.94485,
+            "1283": 4.92932,
+            "1284": 4.92976,
+            "1285": 4.91788,
+            "1286": 4.93408,
+            "1287": 4.92359,
+            "1288": 4.92166,
+            "1289": 4.9185,
+            "1290": 4.91424,
+            "1291": 4.91891,
+            "1292": 4.92028,
+            "1293": 4.9117,
+            "1294": 5.27044,
+            "1295": 5.29676,
+            "1296": 4.91703,
+            "1297": 4.92056,
+            "1298": 4.92207,
+            "1299": 5.91394,
+            "1300": 4.9147,
+            "1301": 4.9131,
+            "1302": 4.9176,
+            "1303": 4.93425,
+            "1304": 5.304,
+            "1305": 4.91978,
+            "1306": 5.27498,
+            "1307": 4.92043,
+            "1308": 4.91675,
+            "1309": 5.27831,
+            "1310": 4.93667,
+            "1311": 4.93075,
+            "1312": 4.92766,
+            "1313": 4.92554,
+            "1314": 4.93753,
+            "1315": 4.93323,
+            "1316": 4.92326,
+            "1317": 4.92226,
+            "1318": 4.9254,
+            "1319": 4.91683,
+            "1320": 4.91352,
+            "1321": 4.93361,
+            "1322": 4.9202,
+            "1323": 4.92888,
+            "1324": 4.94749,
+            "1325": 4.92427,
+            "1326": 4.91993,
+            "1327": 4.94147,
+            "1328": 4.91569,
+            "1329": 4.9082,
+            "1330": 4.90808,
+            "1331": 4.92463,
+            "1332": 4.94304,
+            "1333": 4.91833,
+            "1334": 4.91915,
+            "1335": 4.9569,
+            "1336": 4.91253,
+            "1337": 4.91228,
+            "1338": 4.91599,
+            "1339": 5.26886,
+            "1340": 4.94108,
+            "1341": 5.28895,
+            "1342": 4.92166,
+            "1343": 4.93148,
+            "1344": 6.20454,
+            "1345": 4.93732,
+            "1346": 4.94109,
+            "1347": 5.28178,
+            "1348": 4.92597,
+            "1349": 5.31528,
+            "1350": 4.93124,
+            "1351": 4.9199,
+            "1352": 4.92145,
+            "1353": 4.91761,
+            "1354": 4.91599,
+            "1355": 4.91867,
+            "1356": 4.92286,
+            "1357": 4.91965,
+            "1358": 4.92454,
+            "1359": 4.92188,
+            "1360": 4.91921,
+            "1361": 4.92021,
+            "1362": 4.92372,
+            "1363": 4.91207,
+            "1364": 4.96107,
+            "1365": 4.91388,
+            "1366": 4.91683,
+            "1367": 4.91413,
+            "1368": 4.91691,
+            "1369": 4.91871,
+            "1370": 4.92278,
+            "1371": 4.92605,
+            "1372": 4.92653,
+            "1373": 4.9264,
+            "1374": 4.92864,
+            "1375": 4.92839,
+            "1376": 4.93185,
+            "1377": 4.92304,
+            "1378": 4.92916,
+            "1379": 4.92701,
+            "1380": 4.92797,
+            "1381": 5.27325,
+            "1382": 4.89544,
+            "1383": 4.89064,
+            "1384": 5.60494,
+            "1385": 5.00482,
+            "1386": 5.33879,
+            "1387": 4.92912,
+            "1388": 4.92575,
+            "1389": 5.83703,
+            "1390": 4.91691,
+            "1391": 4.91717,
+            "1392": 4.92005,
+            "1393": 4.92211,
+            "1394": 4.91895,
+            "1395": 5.29903,
+            "1396": 4.92143,
+            "1397": 4.91551,
+            "1398": 4.91427,
+            "1399": 4.91348,
+            "1400": 4.92556,
+            "1401": 4.92553,
+            "1402": 4.91884,
+            "1403": 4.91856,
+            "1404": 4.95579,
+            "1405": 4.88917,
+            "1406": 4.88886,
+            "1407": 4.90262,
+            "1408": 4.88379,
+            "1409": 4.88976,
+            "1410": 4.88681,
+            "1411": 4.8751,
+            "1412": 4.89308,
+            "1413": 4.89122,
+            "1414": 4.88458,
+            "1415": 4.89489,
+            "1416": 4.88438,
+            "1417": 4.88183,
+            "1418": 5.229,
+            "1419": 4.96736,
+            "1420": 4.95735,
+            "1421": 5.29839,
+            "1422": 4.92896,
+            "1423": 4.9679,
+            "1424": 4.96109,
+            "1425": 4.96048,
+            "1426": 4.95854,
+            "1427": 4.95558,
+            "1428": 4.90503,
+            "1429": 5.24486,
+            "1430": 5.24901,
+            "1431": 4.8987,
+            "1432": 4.89075,
+            "1433": 5.22736,
+            "1434": 5.47175,
+            "1435": 4.89209,
+            "1436": 4.8986,
+            "1437": 4.8891,
+            "1438": 4.88697,
+            "1439": 4.88974,
+            "1440": 5.27298,
+            "1441": 4.89403,
+            "1442": 4.90495,
+            "1443": 4.89585,
+            "1444": 4.89766,
+            "1445": 4.89344,
+            "1446": 4.89618,
+            "1447": 4.88721,
+            "1448": 4.88735,
+            "1449": 4.89401,
+            "1450": 4.89435,
+            "1451": 4.89143,
+            "1452": 4.88553,
+            "1453": 4.89139,
+            "1454": 4.89347,
+            "1455": 5.23147,
+            "1456": 4.8987,
+            "1457": 4.90447,
+            "1458": 4.89553,
+            "1459": 5.23187,
+            "1460": 4.90546,
+            "1461": 4.89293,
+            "1462": 4.89652,
+            "1463": 4.88806,
+            "1464": 4.94852,
+            "1465": 4.89339,
+            "1466": 4.88888,
+            "1467": 4.89409,
+            "1468": 4.89028,
+            "1469": 4.89198,
+            "1470": 4.89499,
+            "1471": 4.89853,
+            "1472": 4.89989,
+            "1473": 5.245,
+            "1474": 4.89244,
+            "1475": 5.24744,
+            "1476": 4.88786,
+            "1477": 4.88954,
+            "1478": 5.81074,
+            "1479": 4.90603,
+            "1480": 4.8817,
+            "1481": 4.88853,
+            "1482": 4.88913,
+            "1483": 4.88525,
+            "1484": 4.88091,
+            "1485": 5.26103,
+            "1486": 4.88332,
+            "1487": 4.88482,
+            "1488": 4.88349,
+            "1489": 4.93535,
+            "1490": 4.93713,
+            "1491": 4.94008,
+            "1492": 4.93273,
+            "1493": 5.26558,
+            "1494": 4.92625,
+            "1495": 4.93119,
+            "1496": 4.93326,
+            "1497": 5.29661,
+            "1498": 4.94651,
+            "1499": 4.94563,
+            "1500": 4.94732,
+            "1501": 4.94956,
+            "1502": 4.93949,
+            "1503": 4.94314,
+            "1504": 4.949,
+            "1505": 4.93848,
+            "1506": 4.93655,
+            "1507": 4.93352,
+            "1508": 4.93376,
+            "1509": 4.93575,
+            "1510": 4.93237,
+            "1511": 4.93325,
+            "1512": 4.93443,
+            "1513": 4.93608,
+            "1514": 4.92875,
+            "1515": 4.93822,
+            "1516": 4.92271,
+            "1517": 4.93602,
+            "1518": 4.93135,
+            "1519": 5.28269,
+            "1520": 5.28601,
+            "1521": 4.93214,
+            "1522": 4.93238,
+            "1523": 4.9331,
+            "1524": 5.84985,
+            "1525": 4.93183,
+            "1526": 4.9312,
+            "1527": 4.94067,
+            "1528": 4.94179,
+            "1529": 4.93283,
+            "1530": 5.64255,
+            "1531": 4.93012,
+            "1532": 4.93237,
+            "1533": 4.93188,
+            "1534": 5.28642,
+            "1535": 4.93295,
+            "1536": 4.93351,
+            "1537": 4.93687,
+            "1538": 4.93395,
+            "1539": 4.93892,
+            "1540": 4.93329,
+            "1541": 4.93178,
+            "1542": 4.94011,
+            "1543": 4.93223,
+            "1544": 4.9238,
+            "1545": 4.93295,
+            "1546": 4.92789,
+            "1547": 4.92723,
+            "1548": 4.93344,
+            "1549": 4.93081,
+            "1550": 4.93484,
+            "1551": 4.93247,
+            "1552": 4.94286,
+            "1553": 4.93871,
+            "1554": 4.9346,
+            "1555": 4.93508,
+            "1556": 4.93254,
+            "1557": 4.93621,
+            "1558": 4.93402,
+            "1559": 4.92552,
+            "1560": 4.92871,
+            "1561": 4.9342,
+            "1562": 4.93981,
+            "1563": 4.94231,
+            "1564": 5.28559,
+            "1565": 5.2926,
+            "1566": 4.93393,
+            "1567": 5.27554,
+            "1568": 5.55669,
+            "1569": 5.22897,
+            "1570": 4.93426,
+            "1571": 5.28382,
+            "1572": 4.94938,
+            "1573": 4.95055,
+            "1574": 4.94811,
+            "1575": 4.9489,
+            "1576": 5.33208,
+            "1577": 4.94524,
+            "1578": 4.94592,
+            "1579": 4.94832,
+            "1580": 4.94832,
+            "1581": 4.94408,
+            "1582": 4.93963,
+            "1583": 5.06791,
+            "1584": 4.93161,
+            "1585": 4.93335,
+            "1586": 4.93849,
+            "1587": 4.93237,
+            "1588": 4.93556,
+            "1589": 4.93066,
+            "1590": 4.94768,
+            "1591": 4.93099,
+            "1592": 4.93258,
+            "1593": 4.93981,
+            "1594": 4.92949,
+            "1595": 4.93453,
+            "1596": 4.92827,
+            "1597": 4.92584,
+            "1598": 4.93755,
+            "1599": 4.92974,
+            "1600": 4.94804,
+            "1601": 4.93191,
+            "1602": 4.93369,
+            "1603": 4.93286,
+            "1604": 4.93069,
+            "1605": 5.27051,
+            "1606": 4.92329,
+            "1607": 4.92495,
+            "1608": 5.27779,
+            "1609": 5.28346,
+            "1610": 5.29602,
+            "1611": 4.94123,
+            "1612": 4.93638,
+            "1613": 5.856,
+            "1614": 4.94437,
+            "1615": 4.93653,
+            "1616": 4.93875,
+            "1617": 4.93536,
+            "1618": 4.93896,
+            "1619": 4.93356,
+            "1620": 4.93572,
+            "1621": 5.31736,
+            "1622": 4.94531,
+            "1623": 4.94225,
+            "1624": 4.94386,
+            "1625": 4.93406,
+            "1626": 4.93798,
+            "1627": 4.93633,
+            "1628": 4.93917,
+            "1629": 4.93696,
+            "1630": 4.93053,
+            "1631": 4.92648,
+            "1632": 4.92658,
+            "1633": 4.93841,
+            "1634": 4.93342,
+            "1635": 4.9359,
+            "1636": 4.93181,
+            "1637": 4.93503,
+            "1638": 4.93642,
+            "1639": 4.93683,
+            "1640": 4.93436,
+            "1641": 4.9443,
+            "1642": 5.27794,
+            "1643": 4.94268,
+            "1644": 4.91864,
+            "1645": 4.92135,
+            "1646": 5.26653,
+            "1647": 4.93155,
+            "1648": 4.94793,
+            "1649": 4.92681,
+            "1650": 4.92909,
+            "1651": 4.92222,
+            "1652": 4.93308,
+            "1653": 5.27802,
+            "1654": 5.27831,
+            "1655": 4.92527,
+            "1656": 4.92184,
+            "1657": 4.92535,
+            "1658": 5.84478,
+            "1659": 4.93415,
+            "1660": 4.98533,
+            "1661": 4.95752,
+            "1662": 4.94766,
+            "1663": 4.94933,
+            "1664": 4.95355,
+            "1665": 4.94643,
+            "1666": 5.33217,
+            "1667": 4.93611,
+            "1668": 4.93532,
+            "1669": 4.9092,
+            "1670": 4.90894,
+            "1671": 4.9204,
+            "1672": 4.92236,
+            "1673": 4.9082,
+            "1674": 4.91286,
+            "1675": 4.90919,
+            "1676": 4.90864,
+            "1677": 4.91312,
+            "1678": 4.90871,
+            "1679": 4.92308,
+            "1680": 5.26267,
+            "1681": 4.92022,
+            "1682": 4.91096,
+            "1683": 4.91568,
+            "1684": 5.26065,
+            "1685": 4.90909,
+            "1686": 4.90718,
+            "1687": 4.91023,
+            "1688": 4.91504,
+            "1689": 4.9123,
+            "1690": 4.91353,
+            "1691": 4.90838,
+            "1692": 4.90311,
+            "1693": 4.90235,
+            "1694": 4.90376,
+            "1695": 4.90901,
+            "1696": 4.90724,
+            "1697": 4.91094,
+            "1698": 5.25776,
+            "1699": 4.91455,
+            "1700": 5.2613,
+            "1701": 4.90973,
+            "1702": 4.90149,
+            "1703": 5.82797,
+            "1704": 4.9102,
+            "1705": 4.91831,
+            "1706": 4.90187,
+            "1707": 4.89945,
+            "1708": 4.89865,
+            "1709": 4.89632,
+            "1710": 4.90065,
+            "1711": 5.28146,
+            "1712": 4.90271,
+            "1713": 4.90852,
+            "1714": 4.90365,
+            "1715": 4.90463,
+            "1716": 4.91059,
+            "1717": 5.24655,
+            "1718": 4.91868,
+            "1719": 4.90569,
+            "1720": 4.91426,
+            "1721": 4.91116,
+            "1722": 5.25454,
+            "1723": 4.91058,
+            "1724": 4.90906,
+            "1725": 4.92075,
+            "1726": 4.91839,
+            "1727": 4.91564,
+            "1728": 4.91131,
+            "1729": 4.91291,
+            "1730": 4.90884,
+            "1731": 4.91062,
+            "1732": 4.90638,
+            "1733": 4.9061,
+            "1734": 4.90658,
+            "1735": 4.91543,
+            "1736": 4.90614,
+            "1737": 4.91107,
+            "1738": 4.91084,
+            "1739": 4.90842,
+            "1740": 4.91418,
+            "1741": 4.90881,
+            "1742": 4.90792,
+            "1743": 5.26397,
+            "1744": 4.91738,
+            "1745": 5.25587,
+            "1746": 4.90599,
+            "1747": 4.90321,
+            "1748": 5.78796,
+            "1749": 4.90348,
+            "1750": 4.90858,
+            "1751": 4.89993,
+            "1752": 4.90938,
+            "1753": 4.90593,
+            "1754": 5.25406,
+            "1755": 4.9167,
+            "1756": 4.92732,
+            "1757": 5.32154,
+            "1758": 4.93234,
+            "1759": 5.25874,
+            "1760": 4.90683,
+            "1761": 4.90629,
+            "1762": 4.91525,
+            "1763": 4.91544,
+            "1764": 4.91062,
+            "1765": 4.90636,
+            "1766": 4.90873,
+            "1767": 4.91142,
+            "1768": 4.96573,
+            "1769": 4.90448,
+            "1770": 4.8891,
+            "1771": 4.8932,
+            "1772": 4.88066,
+            "1773": 4.87927,
+            "1774": 4.87496,
+            "1775": 4.90017,
+            "1776": 4.88861,
+            "1777": 4.88943,
+            "1778": 4.88632,
+            "1779": 4.89539,
+            "1780": 4.88673,
+            "1781": 4.89482,
+            "1782": 4.89261,
+            "1783": 4.88921,
+            "1784": 4.89935,
+            "1785": 4.88986,
+            "1786": 4.89061,
+            "1787": 4.88853,
+            "1788": 5.24035,
+            "1789": 5.24993,
+            "1790": 4.91207,
+            "1791": 4.91991,
+            "1792": 5.55415,
+            "1793": 5.49039,
+            "1794": 4.899,
+            "1795": 4.88922,
+            "1796": 5.25127,
+            "1797": 4.89889,
+            "1798": 4.90442,
+            "1799": 4.89627,
+            "1800": 4.89346,
+            "1801": 4.89082,
+            "1802": 5.2731,
+            "1803": 4.89886,
+            "1804": 4.87379,
+            "1805": 4.87577,
+            "1806": 4.88484,
+            "1807": 4.87576,
+            "1808": 4.86783,
+            "1809": 4.8917,
+            "1810": 4.87329,
+            "1811": 4.87182,
+            "1812": 4.8594,
+            "1813": 4.86213,
+            "1814": 4.86701,
+            "1815": 4.86025,
+            "1816": 4.86454,
+            "1817": 4.86162,
+            "1818": 4.85688,
+            "1819": 4.85907,
+            "1820": 4.85765,
+            "1821": 4.85878,
+            "1822": 4.86537,
+            "1823": 4.86101,
+            "1824": 4.86218,
+            "1825": 4.86082,
+            "1826": 4.85916,
+            "1827": 4.86304,
+            "1828": 4.86335,
+            "1829": 4.85846,
+            "1830": 5.21054,
+            "1831": 4.87227,
+            "1832": 5.20618,
+            "1833": 4.86815,
+            "1834": 5.55416,
+            "1835": 4.87798,
+            "1836": 4.89752,
+            "1837": 5.79486,
+            "1838": 4.90553,
+            "1839": 4.90533,
+            "1840": 4.89368,
+            "1841": 4.89475,
+            "1842": 4.89469,
+            "1843": 4.88557,
+            "1844": 4.89,
+            "1845": 4.88668,
+            "1846": 4.89537,
+            "1847": 5.26263,
+            "1848": 4.89245,
+            "1849": 4.89348,
+            "1850": 4.88835,
+            "1851": 4.90708,
+            "1852": 4.90228,
+            "1853": 4.86785,
+            "1854": 4.87736,
+            "1855": 4.87369,
+            "1856": 4.87811,
+            "1857": 4.90299,
+            "1858": 4.88442,
+            "1859": 4.87297,
+            "1860": 4.89531,
+            "1861": 4.90241,
+            "1862": 4.89309,
+            "1863": 4.89512,
+            "1864": 4.90549,
+            "1865": 4.90854,
+            "1866": 4.9047,
+            "1867": 5.2401,
+            "1868": 4.89946,
+            "1869": 4.90883,
+            "1870": 4.90522,
+            "1871": 4.93888,
+            "1872": 5.21372,
+            "1873": 4.87709,
+            "1874": 4.86464,
+            "1875": 4.87233,
+            "1876": 4.88054,
+            "1877": 4.84923,
+            "1878": 5.17207,
+            "1879": 5.1976,
+            "1880": 4.8445,
+            "1881": 4.84388,
+            "1882": 4.84797,
+            "1883": 5.73664,
+            "1884": 4.84672,
+            "1885": 4.84557,
+            "1886": 4.85201,
+            "1887": 4.85018,
+            "1888": 4.84932,
+            "1889": 4.85617,
+            "1890": 4.84416,
+            "1891": 4.85089,
+            "1892": 4.84881,
+            "1893": 5.22668,
+            "1894": 4.8491,
+            "1895": 4.84681,
+            "1896": 4.84529,
+            "1897": 4.84998,
+            "1898": 4.8507,
+            "1899": 4.84271,
+            "1900": 4.84844,
+            "1901": 4.84365,
+            "1902": 4.83991,
+            "1903": 4.84228,
+            "1904": 5.17846,
+            "1905": 4.84978,
+            "1906": 4.84285,
+            "1907": 4.85138,
+            "1908": 4.84338,
+            "1909": 5.19721,
+            "1910": 4.85138,
+            "1911": 4.84739,
+            "1912": 4.84478,
+            "1913": 4.85226,
+            "1914": 4.85002,
+            "1915": 4.85039,
+            "1916": 4.85444,
+            "1917": 4.84588,
+            "1918": 4.8495,
+            "1919": 4.85217,
+            "1920": 4.84949,
+            "1921": 4.84631,
+            "1922": 4.84476,
+            "1923": 5.17493,
+            "1924": 5.19107,
+            "1925": 4.85154,
+            "1926": 4.84261,
+            "1927": 5.44494,
+            "1928": 5.14044,
+            "1929": 4.84927,
+            "1930": 4.84493,
+            "1931": 4.84048,
+            "1932": 4.84204,
+            "1933": 4.84664,
+            "1934": 4.84105,
+            "1935": 4.83981,
+            "1936": 4.841,
+            "1937": 4.84038,
+            "1938": 5.22894,
+            "1939": 4.84209,
+            "1940": 4.84356,
+            "1941": 5.20657,
+            "1942": 4.9004,
+            "1943": 4.90813,
+            "1944": 4.90655,
+            "1945": 4.88214,
+            "1946": 5.21239,
+            "1947": 4.86529,
+            "1948": 4.85849,
+            "1949": 4.85084,
+            "1950": 4.86533,
+            "1951": 4.86,
+            "1952": 4.85847,
+            "1953": 4.86113,
+            "1954": 4.85194,
+            "1955": 4.85611,
+            "1956": 4.87124,
+            "1957": 4.8777,
+            "1958": 4.84686,
+            "1959": 4.84732,
+            "1960": 4.86364,
+            "1961": 4.8509,
+            "1962": 4.8663,
+            "1963": 4.87064,
+            "1964": 4.86099,
+            "1965": 4.86103,
+            "1966": 4.84569,
+            "1967": 5.17792,
+            "1968": 4.84796,
+            "1969": 5.20648,
+            "1970": 4.84901,
+            "1971": 4.84838,
+            "1972": 5.74018,
+            "1973": 4.85813,
+            "1974": 4.85367,
+            "1975": 4.86684,
+            "1976": 4.87041,
+            "1977": 4.90603,
+            "1978": 4.90475,
+            "1979": 5.25145,
+            "1980": 4.94444,
+            "1981": 4.92124,
+            "1982": 4.90832,
+            "1983": 4.94722,
+            "1984": 5.67636,
+            "1985": 4.939,
+            "1986": 4.93543,
+            "1987": 4.96136,
+            "1988": 4.92447,
+            "1989": 4.87603,
+            "1990": 4.86128,
+            "1991": 4.86822,
+            "1992": 4.86666,
+            "1993": 4.85995,
+            "1994": 4.86025,
+            "1995": 4.85738,
+            "1996": 4.86953,
+            "1997": 4.86535,
+            "1998": 4.86591,
+            "1999": 4.86231,
+            "2000": 4.86466
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json
index 9be8a9dc0ca..dc1e1921fd8 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/golden_values_dev_dgx_h100.json
@@ -1,1028 +1,1028 @@
 {
     "throughput": [
-        94.6087716527102,
-        115.85992244026639,
-        138.9562527069375,
-        133.18726531918395,
-        81.97861561771212,
-        134.30726469422635,
-        86.456140428456,
-        114.99456351298251,
-        147.3101800153954,
-        3.0364623744653003,
-        124.7590786954667,
-        134.2276982994434,
-        3.0580463134110167,
-        117.03969654341354,
-        130.92134521286803,
-        48.493091604204935,
-        1.4498729599486508,
-        128.01470907994928,
-        1.8330770354872434,
-        66.31842482241125,
-        82.24189975425459,
-        1.07058112939944,
-        1.8815468970982412,
-        0.9373246942729808,
-        134.9963160815443,
-        2.285771114682068,
-        43.068220270070434,
-        134.9677086822377,
-        82.44946740133796,
-        47.71839155542011,
-        114.4199568886962,
-        29.67621576315833,
-        144.1589742491705,
-        95.8164720809401,
-        122.80562228460093,
-        39.21436814433054,
-        3.041180292262413,
-        3.2867844729646842,
-        72.43808226229888,
-        0.8371525937296347,
-        1.2212635079980698,
-        145.6869075644325,
-        42.317711349146016,
-        109.1196064871946,
-        73.6281770453198,
-        140.4495689387567,
-        1.219834296561022,
-        138.66856497329005,
-        23.33818821323391,
-        67.82342558671365,
-        130.09683254313987,
-        147.60199288178146,
-        0.9427431720755464,
-        3.2856495013162523,
-        79.12426666101076,
-        86.41557345094756,
-        120.17346279825053,
-        137.16615251640926,
-        108.93291864542198,
-        110.10504114490513,
-        46.19253755421628,
-        0.950218846923012,
-        136.50642826951463,
-        142.73168666846448,
-        1.2206786818073785,
-        1.898581377105612,
-        131.72636154091063,
-        2.2842414327001976,
-        89.76521170090028,
-        114.66053545744656,
-        58.64474290044525,
-        0.8367865961030284,
-        128.01767795820945,
-        60.87292097103301,
-        124.20016865241587,
-        119.59336898055426,
-        0.9425820346281929,
-        93.70053305431952,
-        1.0728113870213674,
-        135.7596767309971,
-        112.89357243644062,
-        89.2743296587299,
-        137.86411291342458,
-        135.6974706051771,
-        102.59633828443238,
-        129.82058179399326,
-        139.57672703148444,
-        140.5642311163746,
-        78.49182953675201,
-        123.40912657074227,
-        82.74099904578694,
-        75.5490641626476,
-        93.38596238341951,
-        141.19058076067225,
-        1.072254167577298,
-        100.8669047802279,
-        132.77382347347034,
-        92.29086179175866,
-        137.20301032384705,
-        89.57723938765776,
-        67.5465256589703,
-        0.9498935124108836,
-        1.0716887464650027,
-        0.8365472180547067,
-        137.902625307774,
-        132.67132600219722,
-        1.45201860416265,
-        1.8366476879619427,
-        88.65095604379363,
-        132.1806036761347,
-        126.0481874394642,
-        127.43750324083169,
-        93.27238135265156,
-        109.83884164204308,
-        102.30516355984702,
-        141.10387096377744,
-        0.9425154448032942,
-        95.04281981148903,
-        103.11525529548061,
-        0.8361762901534399,
-        135.3171561172067,
-        123.30032998064965,
-        118.75691144485415,
-        82.21375599642211,
-        66.37216333263251,
-        120.02349229491865,
-        27.339414655466246,
-        133.1312422227687,
-        123.02377779863252,
-        111.0798894329,
-        58.88405247768833,
-        131.31767475108893,
-        40.19076958615912,
-        123.58362152151858,
-        130.6541142941889,
-        61.39555613504246,
-        43.92154495664044,
-        1.037012527495492,
-        127.16052127606021,
-        137.06554800183082,
-        85.67161160523041,
-        1.0253417447981334,
-        139.20903624514017,
-        140.19068787455728,
-        117.67416498245059,
-        23.410837515725987,
-        130.73052473972666,
-        22.561824695346466,
-        1.028901717647808,
-        119.30712483977753,
-        117.77548263464804,
-        135.2959098119142,
-        142.10193821260228,
-        1.0366044325624144,
-        1.0350271698893887,
-        132.8943567509843,
-        51.50353963446039,
-        113.39559408843714,
-        124.25424103796537,
-        129.60407993083075,
-        136.8566687186031,
-        1.036163010240988,
-        1.0345739017743927,
-        118.72350056844492,
-        32.453707095990595,
-        43.851925176925825,
-        139.39206855448938,
-        141.0979597861742,
-        132.81461728578432,
-        80.95956255477945,
-        133.42483643501154,
-        57.27721135575491,
-        81.47649794801364,
-        79.39765285063396,
-        56.40255861789973,
-        0.8890603607397893,
-        137.59325887086797,
-        118.03982850100024,
-        53.04390121587005,
-        88.31177924841927,
-        1.0287550608831881,
-        54.67393025836421,
-        54.73556135447348,
-        129.6143036059356,
-        123.57095756116274,
-        146.05184555314386,
-        55.506024155977386,
-        84.40666358740559,
-        62.68531518105107,
-        147.42894642823578,
-        1.0274253590993496,
-        145.9063526676371,
-        76.36231256557768,
-        1.035808949157935,
-        136.1858098182613,
-        93.13144140533397,
-        54.57886608953819,
-        1.0251956490815057,
-        1.0270063804838983,
-        67.96952180390161,
-        136.90103479290272,
-        78.62986077133174,
-        129.97235998681177,
-        70.57784076609056,
-        1.028567312218149,
-        69.64434330087829,
-        1.0266016363366386,
-        25.142311727265525,
-        139.54750333578679,
-        118.80547132463877,
-        1.0342055876192149,
-        132.79991800938092,
-        88.25494664060619,
-        132.4600307114398,
-        1.026200775415348,
-        111.33264788932784,
-        1.031301270403004,
-        104.45912302410692,
-        1.0337771723701492,
-        124.53550504281608,
-        1.0283501183885058,
-        126.53361938982871,
-        139.83512785200963,
-        102.28350299734186,
-        122.68389734539087,
-        139.27095111763788,
-        1.0333552237490158,
-        97.04945381465573,
-        60.63422077140298,
-        1.0248694052483192,
-        96.77644543721476,
-        118.38370846079931,
-        1.0309087229819596,
-        136.0487423665781,
-        1.032932214377732,
-        104.96525711514936,
-        50.75370028394122,
-        125.67617176346853,
-        125.47392048276225,
-        101.59371483024698,
-        119.1183231384482,
-        134.24568445137294,
-        1.0323996653747745,
-        119.28563313083153,
-        50.183581144589674,
-        107.50817556608582,
-        127.4693561344537,
-        116.0234844098742,
-        149.0429439759437,
-        127.77855747904051,
-        1.0319900690130652,
-        129.7400124946839,
-        60.27584011696136,
-        1.0245534026749026,
-        113.8687773549026,
-        129.9927880985222,
-        41.55332067297356,
-        12.991853549713621,
-        144.9384518471586,
-        127.77570879015505,
-        79.09214991388126,
-        1.0326234729165304,
-        144.50618896622706,
-        44.461452482592826,
-        145.75357879817352,
-        150.5618330832813,
-        123.17802281879979,
-        147.0133924731902,
-        57.07203337285457,
-        140.17944630269687,
-        44.5066568841284,
-        150.2834791394652,
-        146.37106237628518,
-        135.59553639884948,
-        21.91845075979551,
-        1.0391172002596458,
-        92.42182316100705,
-        14.98578222593142,
-        19.944740287073653,
-        32.75622847272977,
-        58.94666795839769,
-        1.0428676908165904,
-        97.94938911630567,
-        140.5399781540016,
-        36.397689902912774,
-        1.0322919875583962,
-        33.76444948259586,
-        147.54902815924785,
-        51.316830076622495,
-        153.55703202636914,
-        46.423895018386204,
-        140.271682540213,
-        1.0340651759548871,
-        85.22971449383292,
-        141.80480996358014,
-        1.0234621691055457,
-        1.0355322329825165,
-        136.96321865236195,
-        138.2293990177049,
-        136.89440582973347,
-        96.94919171687799,
-        54.992986423891566,
-        142.91167590864902,
-        138.73615931624403,
-        86.32837448704223,
-        1.0424247604140402,
-        127.58052889290863,
-        138.2472241943501,
-        1.0338260095695477,
-        1.0317372756221133,
-        150.59249576769173,
-        1.0229533138894364,
-        149.1711141084735,
-        1.0419379125129562,
-        1.040305113121658,
-        150.13261057757276,
-        62.47975017460808,
-        70.20443057037575,
-        76.88821624674898,
-        1.0225242667788867,
-        136.83301633777177,
-        1.0414381555227956,
-        131.6044067829552,
-        1.038902005769604,
-        1.0335832618537684,
-        83.38230404797935,
-        3.047737981863063,
-        140.9843162162637,
-        1.0352264324041114,
-        1.0409374510445146,
-        103.17228299164871,
-        1.0383219913492376,
-        67.5151836065632,
-        126.94018489907108,
-        95.29974174831813,
-        1.022161551972834,
-        1.0348032799350415,
-        93.24855217625235,
-        140.00831851627856,
-        142.46553219867087,
-        80.52507876480331,
-        149.47939431741142,
-        125.60095189608528,
-        92.57991472689042,
-        153.09192667088175,
-        98.78787611117323,
-        136.9802701171813,
-        1.0378200246498124,
-        79.05370338483348,
-        145.63143231877774,
-        107.86253722014555,
-        113.1390555766259,
-        150.4596904971142,
-        6.010262757833046,
-        138.11675690694213,
-        1.0371929842524894,
-        55.1702723554103,
-        148.4142582794926,
-        108.62464742566522,
-        142.2515578682958,
-        149.5588988951372,
-        1.0310870179234204,
-        32.798276334675066,
-        145.8363475163408,
-        82.52497836005318,
-        144.77105210255448,
-        140.95035733017403,
-        145.4844811663436,
-        145.0646083055648,
-        139.1641494303434,
-        1.0401220454548914,
-        146.10598185112948,
-        1.0335329080843159,
-        1.0316085392161136,
-        133.98012837767038,
-        129.62059667226987,
-        151.2681266565858,
-        1.030719335336581,
-        135.9600336007384,
-        1.0366589924031362,
-        107.70864165999221,
-        118.06361914834272,
-        148.4615541738592,
-        135.1206190516379,
-        1.0788915925864082,
-        1.0662361391973343,
-        1.0784094142292293,
-        145.5492563111853,
-        100.1745158858024,
-        89.97448812790176,
-        140.13008352060388,
-        8.378443606045758,
-        19.841723966559687,
-        31.11972559764219,
-        127.75589035167928,
-        144.649118240912,
-        83.40454687650907,
-        13.609558087727212,
-        144.14916775068022,
-        143.0831699051951,
-        144.53789580070173,
-        129.35689525213576,
-        126.54760361436873,
-        136.72725454688293,
-        83.66753329456253,
-        35.238850690537326,
-        138.73588075606074,
-        148.39285997484404,
-        141.43706957675556,
-        35.20788617289704,
-        140.22918428708584,
-        141.42288954532623,
-        80.8071906111917,
-        53.480908541665116,
-        96.60869116876205,
-        138.83030943256392,
-        146.89537016655746,
-        1.0659353965573166,
-        138.66041009897964,
-        138.0783824554628,
-        54.95061283513892,
-        1.0688789370964418,
-        145.4981195236156,
-        107.91672388693667,
-        147.39387423946786,
-        143.49840246862203,
-        1.0781871694837721,
-        125.37215873599833,
-        46.390553110182545,
-        1.0683430650310588,
-        60.55314896188811,
-        128.32962060837178,
-        142.6648214311374,
-        1.065532502621677,
-        145.06202945295232,
-        149.5985088362253,
-        43.61426254132819,
-        139.2120402464869,
-        138.80120892663803,
-        142.59390751862693,
-        147.27000174003754,
-        139.5980537408405,
-        142.37081759892675,
-        76.47257166426981,
-        0.8663971721944621,
-        1.067847671923619,
-        1.0752972325757186,
-        139.11225337731244,
-        154.1012640338781,
-        91.85315813315137,
-        7.34066705730821,
-        1.0763437477764217,
-        56.03391448680589,
-        1.067309924884827,
-        1.0747789028833068,
-        1.057667310022394,
-        146.4284745539176,
-        142.32867288307636,
-        132.81801172672715,
-        142.5746724111237,
-        43.178263922620026,
-        140.19958418325498,
-        1.0742201855279276,
-        139.95237701874325,
-        124.69044225989671,
-        89.93275546978569,
-        1.0778110524743836,
-        108.03753008375865,
-        0.8649825661375887,
-        101.22782607000799,
-        138.6615942910557,
-        1.0572642952018412,
-        143.509260845593,
-        1.0651693329533294,
-        97.454990956795,
-        1.075960473594851,
-        104.89429761368234,
-        153.46849816095335,
-        143.28204379991922,
-        112.57923589922926,
-        145.35468060283986,
-        119.53338040876814,
-        132.53105489182144,
-        146.60735281445733,
-        0.8648000721123511,
-        132.61504628627392,
-        140.81953388748138,
-        1.05684091289561,
-        147.29646966899597,
-        1.0646855258714663,
-        1.0772400203863821,
-        137.87592499226204,
-        101.79954304062817,
-        134.45893707567646,
-        1.0737967838723397,
-        147.3289039421509,
-        142.95955673278567,
-        123.11846557585149,
-        139.7223884224781,
-        5.274894457437767,
-        0.8646226703470901,
-        135.27010135142623,
-        134.53222451904563,
-        140.4520894166607,
-        148.6784682726068,
-        148.83999547746723,
-        144.76059628877204,
-        146.09818079047014,
-        0.8644123666240657,
-        133.05795012757028,
-        141.21253159110282,
-        147.08086640702987,
-        153.13511211461227,
-        147.72437078211334,
-        53.87242850230838,
-        61.34701685378028,
-        74.50771860339175,
-        16.40780504974564,
-        16.448796993269678,
-        144.08505364828036,
-        143.78069847853888,
-        145.08382905436133,
-        139.4144567792124,
-        1.113422304912727,
-        23.732299099149245,
-        146.716938504402,
-        1.1150428401994323,
-        1.1070863332993708,
-        147.462815334713,
-        15.300506166735937,
-        142.89311901203018,
-        35.881455163220174,
-        0.8959120615185874,
-        134.50389621984408,
-        79.91603718165896,
-        145.31776951960734,
-        153.19384567886857,
-        142.494036234602,
-        130.58249312188119,
-        1.1128817603274543,
-        56.157995916719756,
-        35.81413980204931,
-        116.5213087641768,
-        63.30354399512571,
-        55.0117106848875,
-        47.52954249314361,
-        153.04709230401787,
-        1.112276523473745,
-        80.1523559974256,
-        136.20373724941714,
-        1.114673225365626,
-        1.1067132158651183,
-        149.29883052073288,
-        145.10950784560325,
-        130.53765167080937,
-        1.111788125890117,
-        0.8957719496064405,
-        1.1050775451489783,
-        17.522300994030367,
-        154.45472111064055,
-        152.07616582090188,
-        1.1020107149905272,
-        138.6808068419634,
-        76.87873177159636,
-        51.43702839643221,
-        138.95045176064437,
-        138.64177504011988,
-        140.72197385602811,
-        132.80947742972836,
-        149.78872816785005,
-        139.94034036065392,
-        154.2632802491591,
-        55.57148538150843,
-        1.1044580058296936,
-        147.1712801496827,
-        77.84198065949245,
-        142.38330204183904,
-        151.76812011990265,
-        145.19131540821485,
-        147.26566215388425,
-        87.12413393605841,
-        1.1038403429439656,
-        141.4935550752979,
-        145.7397470598185,
-        3.3080164659931235,
-        123.0327553358976,
-        146.24080278853327,
-        148.10448175245884,
-        29.234562433775857,
-        151.30177873039895,
-        135.4653748135468,
-        144.3293913931314,
-        148.16163203136404,
-        1.1015876034201657,
-        1.1114790318458536,
-        136.68047783885697,
-        77.72584511329579,
-        125.73692105352463,
-        106.98755729483561,
-        96.25926845246491,
-        1.109721323323522,
-        141.71073652156545,
-        130.22006710827588,
-        145.24478945746003,
-        80.67459353439743,
-        1.1033551544760267,
-        150.03177939272493,
-        154.12875534463626,
-        150.04771421074818,
-        1.1010813815407388,
-        1.1110434127990452,
-        145.385699877379,
-        86.86487551811825,
-        130.16687493633253,
-        143.8726181331947,
-        111.91340621077623,
-        146.0394914387852,
-        1.1006353022455784,
-        134.47903589563677,
-        148.6907436994389,
-        102.87151097507036,
-        137.41724911494663,
-        1.1146766644704549,
-        143.85952373403495,
-        146.92280951248307,
-        1.100156488603178,
-        144.04783334738536,
-        148.53630346113712,
-        58.74848466983248,
-        147.0485685726298,
-        141.32891699761203,
-        142.8441702922343,
-        131.04366253726744,
-        128.6305301075303,
-        1.1106412111686195,
-        147.90025888582002,
-        0.8959265584913588,
-        149.5194069726666,
-        137.43649451567626,
-        1.1068068376551545,
-        68.05269425995475,
-        138.94056631255367,
-        138.43818227469507,
-        69.60391199895408,
-        114.83395091462887,
-        151.34107787433956,
-        141.57237630997332,
-        146.07433910500515,
-        9.941778754980154,
-        131.297822968639,
-        10.386636719874664,
-        10.545636067043365,
-        114.58677137445733,
-        75.28902943071078,
-        90.63452059810655,
-        143.58694736923238,
-        9.901118804514459,
-        144.5206530902411,
-        144.78737732574044,
-        79.81136215142409,
-        84.9314508821071,
-        120.18939827456474,
-        10.225253542151219,
-        9.702822548173124,
-        103.1188517219872,
-        138.5008491242522,
-        92.02238700298246,
-        151.99592340131602,
-        9.807595290716304,
-        150.0447954775559,
-        134.2614008494909,
-        149.38544573345007,
-        149.62298116309924,
-        124.32358754465251,
-        132.817456221544,
-        10.50607995390264,
-        9.78317681034783,
-        151.07916494121415,
-        146.93545537009487,
-        118.45851163082196,
-        145.03008316360754,
-        154.4449202186591,
-        146.86002069809945,
-        150.6932855951215,
-        110.74803327496042,
-        127.40788523389726,
-        150.81323854197058,
-        150.0047673310006,
-        149.6063654551971,
-        133.87244996538675,
-        10.329695475492791,
-        9.414695716712222,
-        106.77032789813472,
-        118.34636653947105,
-        123.44441062862572,
-        144.9015592115516,
-        153.74652990582067,
-        10.065713405335144,
-        129.38998560194165,
-        117.69087049838025,
-        99.15650839997046,
-        127.90462338199198,
-        147.3574863739125,
-        9.696544883885949,
-        9.8853852911422,
-        128.35872796896587,
-        145.2939860705264,
-        128.72081963712404,
-        94.09935653689803,
-        142.8780531031409,
-        130.5213122981276,
-        126.89288883528536,
-        153.36107852781166,
-        149.17239657923582,
-        9.177632630803961,
-        9.387171298727486,
-        109.68196882316985,
-        148.55536204011432,
-        152.61730207818772,
-        9.648922236946333,
-        132.805446535875,
-        138.74295200738652,
-        141.66118217831166,
-        124.0399127789103,
-        113.05005278683446,
-        149.71230902297984,
-        25.727698431920004,
-        129.56419655827216,
-        130.40687823665095,
-        128.46470366050013,
-        150.46298369674685,
-        9.22073843893938,
-        110.36443029340542,
-        148.23878821929193,
-        10.219508495480236,
-        9.615051521185155,
-        9.8723813087942,
-        149.91378148843256,
-        9.149056684599877,
-        130.37704092008303,
-        114.86611671621016,
-        134.53633480709703,
-        131.11593468604048,
-        149.74665952988033,
-        136.60701891253495,
-        146.50864617645632,
-        9.094221140419737,
-        149.69902295915708,
-        126.93245475406366,
-        141.2463933703881,
-        10.18172163650932,
-        136.76582155059438,
-        155.5823388453975,
-        144.68082947663285,
-        142.0128061769988,
-        116.20800508912414,
-        101.13756407758095,
-        10.050927550768915,
-        10.14139856150474,
-        9.573219645146107,
-        146.33874064646594,
-        137.22302119976462,
-        132.14965518046,
-        148.08190796641483,
-        117.6843964457568,
-        153.04352772565807,
-        146.79238076404926,
-        9.522740968586977,
-        145.93484469600287,
-        13.925952420322696,
-        12.697420287309185,
-        146.39122941822845,
-        113.94298610788566,
-        13.844109957456581,
-        154.57922917096633,
-        13.525210269101805,
-        103.83976095796662,
-        97.75660804271413,
-        135.83818209343426,
-        158.60060111529293,
-        111.57793188874757,
-        13.768524263105455,
-        154.2203592546867,
-        108.85242762118563,
-        111.15752259030245,
-        149.5942138872604,
-        119.77102605185765,
-        120.68065341205389,
-        105.29698904913548,
-        151.41465167808087,
-        138.90606724001483,
-        13.437371194424983,
-        119.97194649055415,
-        144.6223725248399,
-        146.9934910169238,
-        149.45319992777343,
-        121.48260402443249,
-        13.662736071688842,
-        14.448955892498802,
-        144.5545360346381,
-        154.00382983055897,
-        151.8635735223181,
-        137.2321484611102,
-        119.71487519948164,
-        88.24978714231261,
-        147.74815341218743,
-        142.1113258863455,
-        132.08775922189477,
-        124.63351274554526,
-        145.72256212355262,
-        100.50708502243579,
-        139.16363846809003,
-        114.82662827063822,
-        154.78307253831395,
-        149.22879563842886,
-        152.6744734255461,
-        145.81022434241217,
-        152.68018782123758,
-        116.75549006136289,
-        12.968595875688791,
-        6.824624970615158,
-        125.05116103474757,
-        147.66072487793718,
-        147.5735120742967,
-        139.1302141298083,
-        146.48542990069834,
-        12.674865288395944,
-        147.88858853602966,
-        6.8124480142416175,
-        137.54766974463703,
-        130.89979405333307,
-        13.364169845161861,
-        14.116086127002273,
-        130.3002929300388,
-        116.98398239487472,
-        152.70827610346095,
-        98.51470626500011,
-        135.1252373635164,
-        14.405992358855888,
-        154.13709739001223,
-        146.28661687368685,
-        137.87827066214206,
-        12.621081453489012,
-        154.04574874294514,
-        6.802625211185703,
-        152.18661864386252,
-        149.30257880598677,
-        13.244501725269068,
-        138.34068638798834,
-        150.95140747506372,
-        141.8441899037163,
-        152.99022366652198,
-        103.95004802425926,
-        140.28144756248412,
-        154.51222806007945,
-        85.40777548962518,
-        154.7067128296305,
-        120.47843952303268,
-        12.568053995018431,
-        12.916583075889136,
-        105.92477484543576,
-        137.92878859711615,
-        135.13853669037294,
-        137.88549737290148,
-        157.83019925734393,
-        145.48927689323145,
-        12.509532718065461,
-        150.6233829715981,
-        119.23669844460764,
-        138.49099023171033,
-        154.0870149904812,
-        140.1862744667834,
-        148.860174031694,
-        147.54629689336036,
-        12.448861769003683,
-        152.4711466483636,
-        102.47079224461186,
-        152.40864885890767,
-        156.21773232766026,
-        13.139291580904986,
-        150.30653960489693,
-        145.43571147072188,
-        132.8965387342577,
-        144.85972103961666,
-        125.5438694385711,
-        158.07457773478276,
-        14.359506122440205,
-        137.7658155977229,
-        153.68125116011197,
-        156.57780724945528,
-        12.394708947912125,
-        12.874702780202174,
-        110.61518572692995,
-        149.4338565730422,
-        149.67552030435513,
-        146.20909415912828,
-        9.308833539527914,
-        26.176147260970783,
-        8.701217384742513,
-        66.92241449340185,
-        105.12940849136734,
-        145.25326276553395,
-        139.68219350261262,
-        131.60335890332783,
-        150.53420884400245,
-        17.552483447968918,
-        99.60476667168517,
-        9.003208512207522,
-        8.539560747895454,
-        9.946172723540226,
-        150.55644446784382,
-        9.608936841972842,
-        104.80864366760326,
-        25.95068644438624,
-        99.42592550150236,
-        108.35979254469888,
-        113.9171427720856,
-        9.905905876631499,
-        131.1684982861573,
-        154.7989292174601,
-        151.34753888952145,
-        150.11816141981262,
-        143.00557828542912,
-        126.2310299151925,
-        113.53830001728545,
-        148.13405630794878,
-        150.7564429392251,
-        155.252325076404,
-        18.20048176554747,
-        25.725436761645142,
-        8.678711562613207,
-        143.3683328827327,
-        127.0294451168928,
-        137.50119476282134,
-        10.068367539846923,
-        155.64822784014916,
-        153.2789382926615,
-        25.46950813818654,
-        142.9138107220956,
-        155.10510899417167,
-        107.40557834412083,
-        9.871948602847068,
-        144.4712732194919,
-        140.17802930301565,
-        9.286026243902361,
-        129.1488895575147,
-        124.35586045151207,
-        140.1410811550992,
-        96.63692877337894,
-        153.62093095799207,
-        156.05800033315097,
-        9.587609950939838,
-        140.09721428165886,
-        134.898750425008,
-        8.652809034763463,
-        8.989448046931262,
-        107.64260577858933,
-        9.825071080298192,
-        150.6237132142087,
-        143.76058852986372,
-        154.01627264735168,
-        140.85322298632985,
-        143.63714834446708,
-        149.7259575806535,
-        8.53942846683121,
-        157.02635815805976,
-        150.83913162907433,
-        154.0283691261865,
-        9.246842209481716,
-        154.5851361854829,
-        133.4662155767381,
-        137.55396410787307,
-        105.77910782321499,
-        148.97953057255376,
-        111.3041581371634,
-        9.543858351726714,
-        142.71996301994741,
-        144.2417836324451,
-        148.5293262803374,
-        8.95331376662564,
-        105.2724164655814,
-        149.16646109060707,
-        151.1947852118465,
-        9.503293907683512,
-        133.40055362812345,
-        8.776394391795916,
-        148.3675722527084,
-        154.66946641450528,
-        122.71674068416665,
-        149.62192317697068,
-        153.40159484208397,
-        9.46860898864519,
-        146.10526710538994,
-        143.96020057925128,
-        8.62472208077336,
-        8.906885562515198,
-        105.7754218686014,
-        150.17957794387223,
-        144.0451331512576,
-        149.95461039551162,
-        151.46311089131117,
-        142.22104279807664,
-        147.3679944003333,
-        140.5394711174869,
-        123.62157744638432,
-        152.32796921399395,
-        156.6603241829257,
-        9.43621164630811,
-        158.2241383954169,
-        149.33346139426692,
-        144.12074054746773,
-        143.1977521817863,
-        8.536662624511228,
-        9.785635570067782,
-        147.61880087321424,
-        9.402323265876474,
-        159.1161790596516,
-        146.56796834276156,
-        147.64890403285438,
-        157.70847517328534,
-        114.64282143770687,
-        148.5000942425868,
-        10.052761003641129,
-        147.38801074409378
+        98.47864949895008,
+        63.93792629897559,
+        166.49088904974073,
+        148.10611103663214,
+        136.93608898138933,
+        153.87586308063382,
+        90.56559317052603,
+        128.5291550251628,
+        162.07670305023993,
+        4.196475118529487,
+        147.98743190294235,
+        149.72190006929446,
+        1.1777631788022311,
+        133.74963259040626,
+        150.11088322452974,
+        51.863180020864455,
+        4.139051494405947,
+        79.2557164919149,
+        1.6071996867452278,
+        70.01915930069646,
+        137.26891673137558,
+        1.0402098481802287,
+        1.8594022431966566,
+        2.039486534010741,
+        146.2938256177694,
+        4.149796716964247,
+        46.34667799086249,
+        151.47361823216394,
+        137.54739677623354,
+        51.120748066850325,
+        136.84512611150544,
+        32.11962977236786,
+        157.56752902839474,
+        47.12119148820226,
+        145.7314367353006,
+        42.20270560372231,
+        1.0426098595499007,
+        3.5892682955617827,
+        76.57100636536596,
+        1.612496526198,
+        2.6881979572654413,
+        111.88402006134972,
+        45.58338247702666,
+        111.4111889571842,
+        132.16301113659247,
+        161.64295403385984,
+        2.664705818704618,
+        157.1638935590632,
+        25.286871922093454,
+        37.4310109209181,
+        153.65911351957632,
+        170.7256762539797,
+        1.042128189044151,
+        3.5869040413041917,
+        83.30261586197105,
+        90.55970202339806,
+        132.9415846015795,
+        95.80834182322752,
+        112.4369142570399,
+        130.7156977512895,
+        90.98968148626129,
+        0.9371270459059615,
+        159.09279181195387,
+        162.9970081970886,
+        2.6700708026356366,
+        1.8557378891084773,
+        156.12103246797463,
+        1.3653778104766194,
+        143.46571269908148,
+        130.6346250925551,
+        62.46023289115923,
+        1.6116060776090406,
+        139.8111163213305,
+        34.86018737886305,
+        146.06865198079345,
+        133.96801334258495,
+        1.0417626130871034,
+        97.53781169320182,
+        2.0478975910586503,
+        151.90776052541932,
+        126.40035137658552,
+        44.78808603802679,
+        163.9803901721219,
+        152.78287546210825,
+        154.77428093351637,
+        145.74430748169019,
+        163.03421864587594,
+        146.28703545539014,
+        82.55934081518444,
+        73.53123347847824,
+        87.20650201489909,
+        79.6237289961617,
+        146.76012425672718,
+        162.46398331888344,
+        2.046000130560097,
+        104.11707807083185,
+        142.7981951169222,
+        45.781111784259096,
+        164.13498801895528,
+        93.34392878508068,
+        127.09756182184553,
+        0.9369885821746623,
+        2.0440080852076448,
+        1.6107470231739485,
+        149.4484511068655,
+        87.5539915318001,
+        1.3670348174101508,
+        1.1796264961520015,
+        142.53546263417087,
+        150.2065859393766,
+        145.65883203776818,
+        142.2125733485302,
+        96.99016545580078,
+        57.32416740237564,
+        106.63530054957698,
+        159.19142654590536,
+        1.0415326032228118,
+        98.71719677010607,
+        106.73175053259962,
+        1.6100826372227688,
+        146.64805335844048,
+        72.59518577946031,
+        142.34132184480842,
+        85.94240702745647,
+        126.17687901514078,
+        135.7696701691411,
+        29.62308081982307,
+        148.2421144346034,
+        130.36261145275355,
+        53.13931721337651,
+        60.51160243931191,
+        141.54695622051943,
+        73.11803837069677,
+        137.21251141324606,
+        148.63844490308944,
+        62.8404582738594,
+        45.401831957608,
+        0.9643006239654945,
+        147.2298500624911,
+        151.91506054646217,
+        140.48716103219812,
+        0.9577624967779577,
+        160.06459889404132,
+        155.2359539910114,
+        126.59645077786885,
+        15.69438649059929,
+        152.80784197867072,
+        23.527136960081226,
+        0.9561607658842026,
+        135.304826702121,
+        142.47511264536794,
+        149.8501903787043,
+        151.43523022097875,
+        0.9640793717349251,
+        0.9631519875374979,
+        145.2950579689095,
+        104.16937732598902,
+        131.1708059930721,
+        144.18743838648734,
+        143.6919419808989,
+        145.5428193502994,
+        0.9638106812588461,
+        0.9627615573404509,
+        116.54193238808332,
+        54.308902955274014,
+        45.33558667751163,
+        159.57290743060722,
+        156.60366994005867,
+        142.03263718363198,
+        40.71403223415776,
+        155.40510615972553,
+        58.6681100653237,
+        137.0437576533739,
+        80.42300690375168,
+        58.033083103031665,
+        0.9693871919683402,
+        145.73573001557583,
+        60.44621412824422,
+        54.994288450325136,
+        88.73692291143061,
+        0.9559459748869998,
+        56.08954858644736,
+        56.31747770886735,
+        142.34693049846092,
+        132.51002333480037,
+        108.96587128971876,
+        57.39669142091791,
+        85.1254544103699,
+        122.1342568773111,
+        170.14800453897098,
+        0.9667745869936778,
+        164.77118206030752,
+        77.67607540068808,
+        0.9637172808805204,
+        159.27278631745818,
+        93.32941075871183,
+        114.31154051585622,
+        0.9577271441482065,
+        0.9663851340406727,
+        69.18116638176265,
+        145.49566595839337,
+        39.99458755398874,
+        151.72058228459386,
+        71.71902007184255,
+        0.955684788125637,
+        70.8845735459765,
+        0.9659986810119839,
+        26.22947505868186,
+        149.5122587573231,
+        62.37088691999424,
+        0.9626226162613168,
+        144.16390862207493,
+        143.18707878361667,
+        148.34680655358588,
+        0.9655981786202157,
+        128.6357514760558,
+        0.972457638109508,
+        47.97113131021637,
+        0.962257594040168,
+        135.91488529586792,
+        0.9555101570399641,
+        139.87244415060783,
+        161.80374363862717,
+        102.03749537949356,
+        119.90228156989667,
+        95.01508726085196,
+        0.9618747782794568,
+        97.04528669323962,
+        124.83482655795,
+        0.9575074351185681,
+        97.4749088017089,
+        143.04337002379702,
+        0.9720616869548507,
+        88.4343283770829,
+        0.9616266920922193,
+        104.03159874923712,
+        102.89124420706305,
+        140.2496100327507,
+        143.1710058572335,
+        101.42975069052237,
+        128.03336431254732,
+        85.69336920713639,
+        0.9613543134449882,
+        104.07697069101184,
+        100.02889226751559,
+        106.63283752921622,
+        144.57311516379912,
+        126.07240879815421,
+        161.55730431091774,
+        73.12112420438781,
+        0.9589217273481213,
+        142.0323058738417,
+        122.36148204858885,
+        0.9572538602096321,
+        112.98246752660035,
+        142.34355181617389,
+        41.04230698700827,
+        8.473685991981666,
+        170.80637904469666,
+        142.97081601431356,
+        140.00938953689527,
+        1.0308124281925075,
+        163.68673254202156,
+        43.76708184183388,
+        152.25998257998737,
+        111.67117755812934,
+        145.80673033340165,
+        160.967274593742,
+        121.82423347589321,
+        151.58970194946951,
+        43.836717431814456,
+        168.33474851388928,
+        152.8971313956712,
+        72.9024488252911,
+        21.820779024213074,
+        1.0392675847166184,
+        147.87020150991353,
+        14.897143028689484,
+        19.847221148151032,
+        32.431828340180246,
+        57.7813822991841,
+        1.0334876773950952,
+        94.25591710682407,
+        151.42229388821934,
+        62.73982551986958,
+        1.0305004930196628,
+        33.431851137208405,
+        162.37672318207316,
+        50.321107844780045,
+        120.0631996858246,
+        45.868384609266045,
+        150.25509288811767,
+        1.03641668355906,
+        82.19687660990678,
+        158.74432925111145,
+        1.041876067399849,
+        1.0459490020450795,
+        74.46636703262733,
+        159.72092018884473,
+        145.89909226306747,
+        151.4623812014693,
+        53.96440008638893,
+        159.793887362778,
+        148.37554042172758,
+        83.3128358383083,
+        1.033330707971675,
+        134.17516572064534,
+        146.71192985844118,
+        1.0352015128775223,
+        1.030228349427348,
+        173.4020929881413,
+        1.0414756431813357,
+        157.44806749626466,
+        1.0330400451866075,
+        1.0430419707188734,
+        167.82243267657728,
+        143.8312255273241,
+        68.13449792020043,
+        74.35987547428464,
+        1.0410410061956523,
+        144.46694632543532,
+        1.0327651323294085,
+        150.93003222189313,
+        1.0391803120976406,
+        1.0348231697568464,
+        80.60319434281541,
+        3.1207628480728475,
+        151.16210456830606,
+        1.044348655121621,
+        1.0324784232146003,
+        99.42447225407219,
+        1.038776111100077,
+        132.7893754958314,
+        146.8726662885585,
+        91.5964670484325,
+        1.0406970130016908,
+        1.0437330582244273,
+        42.28479249749239,
+        162.83839126288393,
+        151.86715746595317,
+        140.5094808302986,
+        170.2080960063118,
+        131.07684807335298,
+        88.96862061056908,
+        163.9922734476757,
+        44.213460221990154,
+        157.8010866400773,
+        1.0382665374856965,
+        139.57673454433854,
+        163.7758432408245,
+        102.99718171708128,
+        107.60774917922078,
+        159.16551335735969,
+        4.119717517454783,
+        160.5803771988876,
+        1.0378430568380714,
+        115.44357851711793,
+        167.4238211695712,
+        103.79633528746076,
+        154.03506418556444,
+        159.03692094687025,
+        1.032427282609682,
+        32.52187142118156,
+        158.57750457420016,
+        141.67055142208721,
+        160.71458938698333,
+        157.24106314480454,
+        157.40833384009724,
+        150.60022387354616,
+        80.91896448664748,
+        1.0430666391532655,
+        160.36671183081978,
+        1.0347878859497883,
+        1.030293958907628,
+        147.50533105226975,
+        152.4875796332852,
+        160.31618334728296,
+        1.0321960030040243,
+        156.27786873980907,
+        1.0375321120324796,
+        160.4885833961135,
+        111.93639192506156,
+        172.24078944530834,
+        145.3287404427809,
+        1.0880735082543522,
+        0.7878037099331565,
+        1.0864480413552253,
+        158.40272521901554,
+        155.28074693629694,
+        87.44836891077435,
+        155.54752700738993,
+        8.411714256180034,
+        19.862348977650086,
+        18.35501539895094,
+        163.43115890247273,
+        157.8836387689617,
+        143.68115882020365,
+        13.66284888141665,
+        160.7292101444063,
+        155.01427847930626,
+        150.31432418581997,
+        60.81928120084204,
+        145.3926688034953,
+        145.30123372502598,
+        144.98393507215505,
+        35.18970147025731,
+        153.82777107784506,
+        164.23228082777166,
+        145.88278452124027,
+        20.46954502286418,
+        162.0360370063431,
+        150.43884956663888,
+        142.41966677764808,
+        53.07266306010992,
+        93.50532435009316,
+        150.1523142285131,
+        152.33361454488718,
+        0.787209685332213,
+        159.9704569183677,
+        147.66926829001207,
+        116.31853611522087,
+        1.0774618364125428,
+        164.22843982362895,
+        103.98183305676696,
+        152.52952151222078,
+        90.29170862480086,
+        1.0862563048060565,
+        118.53710658997939,
+        90.19968385647951,
+        1.0770089089852286,
+        59.61890934626195,
+        134.6160499563656,
+        147.6477708991394,
+        0.7870687303401608,
+        171.47874197919785,
+        165.99226887272076,
+        83.5080960308232,
+        151.55871514895225,
+        154.9605789451006,
+        154.1866343413245,
+        152.69380076313175,
+        78.46281024467942,
+        165.86076250975873,
+        74.6681179766703,
+        1.0816751050475706,
+        1.0766059511099162,
+        1.091025249207128,
+        151.61539901543878,
+        165.44997737983917,
+        41.75139614518547,
+        7.388178711598297,
+        1.0848156120039962,
+        121.93333712957133,
+        1.0761843006794773,
+        1.0905643992997778,
+        1.075801598924969,
+        151.57738041471748,
+        87.38815331117043,
+        154.57766374016802,
+        153.3353461131615,
+        81.63500323812801,
+        153.88446167160095,
+        1.0900521500553328,
+        151.65017721794743,
+        118.01864188919838,
+        40.91238161739305,
+        1.0860502574663193,
+        103.72384951664927,
+        1.081356861209966,
+        97.70962808524236,
+        153.30715221364136,
+        1.0754011583086598,
+        149.80888083526256,
+        0.7870161596702333,
+        95.11588780527678,
+        1.0824954483404,
+        159.0909827809553,
+        176.4607736857684,
+        160.28483143240214,
+        108.14616986068252,
+        150.64495962435973,
+        49.52814184554448,
+        152.62988882612356,
+        161.40766773375927,
+        1.0809227984149974,
+        150.0601857860385,
+        156.59538854909297,
+        1.072689949598873,
+        152.81205676706514,
+        0.7868728895290079,
+        1.0857058881477388,
+        143.4694111503961,
+        159.8022996153893,
+        144.9300712596306,
+        1.089757442067835,
+        160.11340438331118,
+        132.79626776787333,
+        50.38448421210805,
+        162.42137561579725,
+        5.284417747700096,
+        1.0805116052247719,
+        145.73004732672527,
+        152.59775665509528,
+        151.63963715309214,
+        155.59850627759238,
+        104.41906641764095,
+        169.89843638971865,
+        158.37348320912855,
+        1.0800687750785642,
+        149.5543247935483,
+        156.60712632191078,
+        159.6236209903005,
+        163.09782416725415,
+        98.6328505039743,
+        53.85030009718123,
+        61.00364034342645,
+        142.05505100830447,
+        16.614192215593924,
+        16.582992843952567,
+        154.47389623241062,
+        150.9101058615698,
+        90.42581449278116,
+        159.53144787295545,
+        1.1253578624639393,
+        38.131573465314304,
+        163.695564516746,
+        1.1316048014866884,
+        1.1159054012388119,
+        152.5411314388352,
+        111.46983099035936,
+        168.09092507016115,
+        36.13058934697122,
+        1.1197910040154087,
+        142.05200673526159,
+        78.09074458708291,
+        157.63502242964265,
+        162.03218881710688,
+        80.0426703374817,
+        164.26384362727924,
+        1.1222030060702506,
+        123.66591496581279,
+        35.97653651285592,
+        112.29012034978103,
+        62.69199102131731,
+        54.806250360805244,
+        25.5070616004963,
+        187.35211092519995,
+        1.1217003700976045,
+        145.32823111763997,
+        145.9166945337544,
+        1.1301150192515073,
+        1.1155615329029929,
+        154.1440872758632,
+        88.5586247200791,
+        161.60021419086345,
+        1.121175594981433,
+        1.1194211460505468,
+        1.1184405197027008,
+        17.60883897305572,
+        174.5134372600641,
+        160.45245655990746,
+        0.8166461657826791,
+        160.30564706046655,
+        75.44218827386376,
+        108.54547521267394,
+        150.49806131791814,
+        153.04150189313873,
+        150.40965861420275,
+        125.63958433236749,
+        103.12983995128599,
+        164.17811633308784,
+        175.52459662743908,
+        121.09400696724566,
+        1.1180201884652679,
+        166.27365155489332,
+        76.42072368500718,
+        146.90227613796094,
+        110.70803654586257,
+        171.79379505267624,
+        158.67043375351244,
+        147.76280504628218,
+        1.1175125336867027,
+        156.89279233182117,
+        158.0652757498143,
+        3.343340016597665,
+        49.779892185016756,
+        173.36352621939335,
+        162.4424006508065,
+        49.49838297370054,
+        173.86161362836785,
+        128.03796900006384,
+        155.68412076198788,
+        137.87250806830016,
+        0.8165665367853991,
+        1.1298869482124425,
+        159.26492424008396,
+        144.56503533715272,
+        120.55988523349636,
+        103.3722869693168,
+        93.5099865200851,
+        1.1209786631771586,
+        77.46613714395933,
+        153.76092950699294,
+        154.6841596167678,
+        146.17966014780984,
+        1.1171782471429414,
+        172.55763339822,
+        174.99117233418923,
+        157.46750414970307,
+        0.816424346577868,
+        1.1274076620999394,
+        157.20421311127953,
+        148.14748951821153,
+        149.14697533706817,
+        158.95389608842163,
+        107.97531407241593,
+        151.43640801793904,
+        0.8162494126902972,
+        157.001545737823,
+        163.80848036600747,
+        158.89222886851297,
+        147.3506488140666,
+        1.1133445391411512,
+        153.68284200756125,
+        151.8834177926471,
+        0.8160708323289537,
+        136.43010052273473,
+        162.61423354524993,
+        133.02570532111102,
+        137.86961562609895,
+        133.04901735700332,
+        154.9473181767413,
+        123.93507737689346,
+        50.83204611520686,
+        1.1270195451857552,
+        161.44093109510388,
+        1.1195708009057284,
+        169.64321510449827,
+        129.6089117511605,
+        1.115946234318508,
+        60.34621183821726,
+        101.29881161208688,
+        160.90062346193574,
+        68.21783931047266,
+        154.41899008326143,
+        174.07515811573973,
+        159.677356250512,
+        159.67728671666873,
+        9.799978913114145,
+        94.27732771999344,
+        9.60214441506233,
+        9.392617132404062,
+        155.0463449410919,
+        71.59183194783785,
+        87.06866691125934,
+        157.13349078706932,
+        9.759436169606595,
+        110.44611293008246,
+        171.1626230380253,
+        89.85437363374635,
+        107.09248087440588,
+        126.74466225447065,
+        10.009602057141537,
+        9.177527712733529,
+        99.62101604875475,
+        102.08957950312852,
+        99.71118980213345,
+        175.89684251359242,
+        10.182586030301673,
+        171.66004511817064,
+        148.24171173832124,
+        164.5397331583309,
+        158.71440804719356,
+        86.55832242496149,
+        148.610396831239,
+        9.368509685917438,
+        10.136730874821687,
+        173.75231796226313,
+        168.18072479771067,
+        125.24195815296933,
+        151.26149869648452,
+        130.6197551882794,
+        174.23395009631983,
+        170.65779238484487,
+        148.1296912550562,
+        131.11524857886738,
+        177.99920893337523,
+        167.5808938510404,
+        158.60603057794222,
+        93.6097533900039,
+        9.587874811966838,
+        9.33150536695352,
+        141.2149869829261,
+        117.88939818622781,
+        133.45305575288236,
+        156.7555665933833,
+        166.2992810974147,
+        8.762060933047495,
+        147.60747975090285,
+        125.39702986854361,
+        126.29551477783566,
+        133.3684883476696,
+        169.84463465109542,
+        9.160889914093532,
+        9.75005007182584,
+        91.25897804548956,
+        171.15603143396729,
+        137.11852945151446,
+        119.70724002664221,
+        157.24098320319794,
+        144.12095644229885,
+        131.6771710258767,
+        164.00686483698965,
+        120.71707004833677,
+        9.560442320047777,
+        9.299425721987362,
+        147.15785637439873,
+        170.64643820040646,
+        181.465984660646,
+        9.098182272291353,
+        131.1874185050373,
+        100.18931014367688,
+        166.410568062446,
+        135.47929425317378,
+        151.28962080931584,
+        169.34032285811423,
+        24.163402926519016,
+        130.3951109594527,
+        133.85939391500654,
+        91.24306358260182,
+        183.98754016151273,
+        9.265911045247684,
+        147.14244062731618,
+        165.66255588662568,
+        10.016411965833509,
+        9.03577437369573,
+        9.70728564931857,
+        122.8213056543772,
+        9.533743128327513,
+        143.45968503667223,
+        155.32709571771161,
+        141.06113578797667,
+        145.47889938004263,
+        167.35960747366406,
+        138.12559014567552,
+        116.75045269404782,
+        9.4953352412109,
+        170.07468770066882,
+        172.07629747140533,
+        155.39552706715028,
+        9.96413703689447,
+        144.65169143749998,
+        169.142417216155,
+        112.76319305930042,
+        166.30777737368877,
+        123.90774653996388,
+        132.11710295459207,
+        8.76790539542995,
+        9.923343461828647,
+        8.972068632607057,
+        152.30472233633313,
+        99.16466897297458,
+        147.39899220637375,
+        167.5046285318718,
+        158.30798003347417,
+        176.8098098029006,
+        169.2000502496997,
+        8.908205534006084,
+        147.04973272590675,
+        6.01978171115786,
+        12.908947280828421,
+        161.20885865837164,
+        154.1041738397025,
+        12.160112764259807,
+        183.9484777068351,
+        13.885015446203202,
+        103.27604069377547,
+        68.56270954501308,
+        153.39985703870556,
+        188.5641680250544,
+        151.39232245655768,
+        12.050089294787492,
+        183.13047361941102,
+        114.09672566233004,
+        109.88264169611061,
+        118.56400136868983,
+        130.5787804713655,
+        131.9836940557652,
+        139.4770525169641,
+        172.40959805680149,
+        153.8901427211502,
+        13.813000129286806,
+        115.17874112168954,
+        106.36053561017184,
+        174.23315480590185,
+        169.50614560985875,
+        163.7261937236369,
+        11.957456410326769,
+        13.191395790527517,
+        152.6369175652841,
+        164.00689931377138,
+        124.1532871601288,
+        158.7919901602378,
+        126.3012920481913,
+        110.01300143579287,
+        166.51966455859474,
+        161.16730547199728,
+        137.84358628055278,
+        123.59630141121379,
+        109.08989919709578,
+        113.66676604314083,
+        150.33107775824936,
+        155.76683850736808,
+        180.51837524079605,
+        172.49809361722134,
+        171.2412543685433,
+        146.52428847969958,
+        125.58622347928333,
+        132.3599749727434,
+        14.245461215559237,
+        11.847782329285673,
+        127.588030395774,
+        169.03076884237493,
+        160.74766094154035,
+        141.23866796872034,
+        111.11477769019474,
+        12.898248376303878,
+        164.38673745815677,
+        11.791663338710885,
+        148.00296428763687,
+        140.16323874251623,
+        13.719781371654578,
+        14.365561456573998,
+        89.32211257795143,
+        135.76622159161508,
+        175.86032158817434,
+        128.11591032818185,
+        141.79940543502275,
+        13.157166878859636,
+        176.72190145631947,
+        146.35619986228915,
+        98.02869268663022,
+        12.811778712246966,
+        178.01632978541917,
+        11.747222913476566,
+        173.95822172954252,
+        172.47660061508643,
+        13.568556768695913,
+        135.1198744591959,
+        122.01181780569887,
+        165.54722192942938,
+        176.91918611654273,
+        135.48421254380435,
+        152.73279297531656,
+        183.54215600068494,
+        94.73349204436757,
+        165.3454353780521,
+        84.12230571074015,
+        12.73103339619439,
+        14.20676756417383,
+        140.07559949201985,
+        145.7554344839868,
+        148.14304437101455,
+        144.7060493293736,
+        173.19895239158285,
+        107.2396185797313,
+        12.648044488473259,
+        173.48665402770794,
+        161.86284234640354,
+        144.49958539317737,
+        183.33130603616738,
+        149.75316477343017,
+        153.29421953478465,
+        112.95288962968242,
+        12.55136585792316,
+        173.4614521532605,
+        136.4085114015674,
+        173.79337782013562,
+        186.83123762499903,
+        13.449155280150386,
+        155.12272657027916,
+        108.84862656043424,
+        150.6559527232612,
+        161.90374448992205,
+        169.6874597897037,
+        185.13622778245175,
+        13.139280888748093,
+        148.81997444276612,
+        162.826727139871,
+        134.34831771089154,
+        12.478143605322522,
+        14.14151231689335,
+        149.25750191310448,
+        167.44106770036936,
+        170.90279518575983,
+        157.84394143590183,
+        8.296884066877869,
+        7.386407378393029,
+        8.177010477741181,
+        60.00030364994894,
+        137.35670186784466,
+        151.41307554547254,
+        150.53265674110258,
+        141.36182090288565,
+        154.42392832445645,
+        14.407560995301617,
+        113.05280253165802,
+        8.771319013508563,
+        7.756832533799784,
+        7.915167569814742,
+        172.68555416184375,
+        9.246590778625794,
+        112.20973585271739,
+        7.374925625154626,
+        111.78749154901601,
+        119.95753341645725,
+        154.77722687049408,
+        7.888580292543184,
+        149.23559365306315,
+        175.52342653145377,
+        158.75097413261327,
+        114.24446296440473,
+        167.67413927012774,
+        139.92437779140218,
+        153.0991583611961,
+        159.7319334713746,
+        175.77990646480632,
+        175.13373633806003,
+        16.006580912678864,
+        7.364167548538875,
+        8.116426613758023,
+        153.43476931019558,
+        172.67401521610824,
+        137.25039229504623,
+        23.94869767384389,
+        175.71290886984852,
+        160.1562681126053,
+        7.350730708586878,
+        168.18537884347361,
+        177.44645900467552,
+        144.69151322813394,
+        7.870439881886282,
+        158.459275555328,
+        137.04634114797315,
+        8.28967641118504,
+        89.24830243345173,
+        138.72903724038372,
+        144.82472911115988,
+        132.17749274525417,
+        174.5610183503014,
+        187.22444190737485,
+        9.045633413519324,
+        144.6870829429866,
+        94.13484353638168,
+        8.073564944014072,
+        8.763589893125177,
+        144.5449141719037,
+        7.849459359846659,
+        172.45229931306682,
+        148.0354241542905,
+        164.34364023912008,
+        96.62180529545114,
+        168.59818307908336,
+        164.06742901634536,
+        7.6595131274639785,
+        182.08298206042065,
+        174.01829936632802,
+        168.2984620634042,
+        8.26924767633141,
+        127.74401148092224,
+        152.9540589411171,
+        149.70999922953388,
+        138.05370099020382,
+        163.98282164395957,
+        109.96934554907047,
+        8.996791001407733,
+        144.24233837720223,
+        98.41465480385448,
+        176.36506372732458,
+        8.593325518249502,
+        139.963285147378,
+        165.79681947035346,
+        175.45780184642518,
+        8.96329041536684,
+        125.14956588858662,
+        7.352306186940013,
+        177.70817869555052,
+        177.69694242040705,
+        165.7171562780864,
+        159.43363801309405,
+        181.71342376901586,
+        8.928314868453635,
+        147.12698777390037,
+        98.19159288935101,
+        8.060910754944894,
+        8.549652936388815,
+        141.01923634310606,
+        163.73524549575265,
+        160.94825111954881,
+        163.89847325356007,
+        158.8883048029801,
+        101.49136858702906,
+        175.66290061319754,
+        149.61494347618603,
+        167.11717553963226,
+        172.02372197225566,
+        187.6631035218658,
+        8.900520682145078,
+        171.61286914605415,
+        115.54311347996163,
+        169.3438620700623,
+        158.53427223473756,
+        7.62769842231285,
+        7.851152107489017,
+        166.35296262059944,
+        8.869163971328895,
+        174.27048762162556,
+        105.1060664928901,
+        176.44597164262635,
+        183.93578989094215,
+        155.56348841330345,
+        165.030948332619,
+        24.021020986288374,
+        158.54498277925777
     ]
-}
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml
index aa4fde5e512..4b3fe856099 100644
--- a/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml
+++ b/tests/functional_tests/test_cases/gpt/gpt_dynamic_inference_tp1_pp1_dp8_583m_throughputtest_zmq/model_config.yaml
@@ -44,7 +44,8 @@ MODEL_ARGS:
   --inference-dynamic-batching-buffer-size-gb: 20
   --inference-dynamic-batching-cuda-graph-max-tokens: 2048
   --cuda-graph-impl: local
-  --cuda-graph-scope: full
+  --cuda-graph-scope: full_iteration
+  --no-check-for-nan-in-loss-and-grad: true
   --disable-chunked-prefill: true
   --dist-ckpt-strictness: log_unexpected
   --inference-ckpt-non-strict: true # To handle the extra_state errors
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json
index 6034ed16a35..5db3c841bb2 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective/golden_values_dev_dgx_h100.json
@@ -120,8 +120,8 @@
         "values": {
             "1": 1027089408.0,
             "2": 1027091968.0,
-            "3": 1027088384.0,
-            "4": 1027088384.0,
+            "3": 1027087360.0,
+            "4": 1027088896.0,
             "5": 1027090944.0,
             "6": 1027091968.0,
             "7": 1027088896.0,
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json
index 7e5e72f6f2c..e49d958cfdc 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM/golden_values_dev_dgx_h100.json
@@ -120,8 +120,8 @@
         "values": {
             "1": 1027089408.0,
             "2": 1027091968.0,
-            "3": 1027088384.0,
-            "4": 1027088384.0,
+            "3": 1027087360.0,
+            "4": 1027088896.0,
             "5": 1027090944.0,
             "6": 1027091968.0,
             "7": 1027088896.0,
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json
index ef8ee741272..a76d8667ec6 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_pp2_ep4_etp1_mtp_resume_torch_dist_fp8/golden_values_dev_dgx_h100.json
@@ -2,109 +2,343 @@
     "lm loss": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 11.0475,
-            "5": 9.43078,
-            "10": 8.89238,
-            "15": 7.93732,
-            "20": 7.77942,
-            "25": 7.61408,
-            "30": 7.57234,
-            "35": 7.15189,
-            "40": 7.48085,
-            "45": 7.12056,
-            "50": 6.96054
+            "1": 11.0474,
+            "2": 11.03765,
+            "3": 9.6074,
+            "4": 9.2648,
+            "5": 9.42291,
+            "6": 9.09511,
+            "7": 9.12753,
+            "8": 8.75686,
+            "9": 8.61627,
+            "10": 8.89295,
+            "11": 8.37933,
+            "12": 8.39932,
+            "13": 8.32626,
+            "14": 7.81437,
+            "15": 7.93661,
+            "16": 7.99492,
+            "17": 7.95458,
+            "18": 7.67733,
+            "19": 8.07234,
+            "20": 7.78815,
+            "21": 7.48342,
+            "22": 7.48177,
+            "23": 7.34879,
+            "24": 7.34465,
+            "25": 7.61117,
+            "26": 7.01605,
+            "27": 7.54878,
+            "28": 7.26655,
+            "29": 7.43507,
+            "30": 7.56529,
+            "31": 7.32669,
+            "32": 7.50645,
+            "33": 7.5577,
+            "34": 7.60977,
+            "35": 7.14607,
+            "36": 7.00597,
+            "37": 7.34071,
+            "38": 7.11796,
+            "39": 7.46649,
+            "40": 7.47443,
+            "41": 7.41032,
+            "42": 7.17365,
+            "43": 7.16495,
+            "44": 7.34265,
+            "45": 7.10918,
+            "46": 6.83934,
+            "47": 7.22335,
+            "48": 7.05732,
+            "49": 7.53394,
+            "50": 6.95951
         }
     },
     "num-zeros": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 38802620.0,
-            "5": 243556240.0,
-            "10": 716187584.0,
-            "15": 614358336.0,
-            "20": 677963584.0,
-            "25": 736321856.0,
-            "30": 505223648.0,
-            "35": 548946176.0,
-            "40": 412329664.0,
-            "45": 376634624.0,
-            "50": 205546672.0
+            "1": 38802536.0,
+            "2": 38543540.0,
+            "3": 38739408.0,
+            "4": 273756736.0,
+            "5": 205853584.0,
+            "6": 284244640.0,
+            "7": 652227968.0,
+            "8": 790994816.0,
+            "9": 762295424.0,
+            "10": 665870592.0,
+            "11": 618336384.0,
+            "12": 639816192.0,
+            "13": 699169600.0,
+            "14": 620502464.0,
+            "15": 623699456.0,
+            "16": 847396864.0,
+            "17": 601834432.0,
+            "18": 642855744.0,
+            "19": 668078912.0,
+            "20": 574651008.0,
+            "21": 608590080.0,
+            "22": 599821504.0,
+            "23": 558380672.0,
+            "24": 688014720.0,
+            "25": 500623296.0,
+            "26": 532887808.0,
+            "27": 506526976.0,
+            "28": 450900800.0,
+            "29": 528748480.0,
+            "30": 445603872.0,
+            "31": 457250368.0,
+            "32": 400653888.0,
+            "33": 347460640.0,
+            "34": 268919904.0,
+            "35": 495515584.0,
+            "36": 332139008.0,
+            "37": 446760768.0,
+            "38": 391328576.0,
+            "39": 378290400.0,
+            "40": 261331328.0,
+            "41": 368680832.0,
+            "42": 337485280.0,
+            "43": 337755968.0,
+            "44": 324657920.0,
+            "45": 216104608.0,
+            "46": 218159872.0,
+            "47": 302569184.0,
+            "48": 296505312.0,
+            "49": 280170176.0,
+            "50": 268486912.0
         }
     },
     "mem-allocated-bytes": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 7321331200.0,
-            "5": 7321333248.0,
-            "10": 7321333248.0,
-            "15": 7321333248.0,
-            "20": 7321333248.0,
-            "25": 7321333248.0,
-            "30": 7321333248.0,
-            "35": 7321333248.0,
-            "40": 7321333248.0,
-            "45": 7321333248.0,
-            "50": 7321333248.0
+            "1": 7316093440.0,
+            "2": 7316095488.0,
+            "3": 7316095488.0,
+            "4": 7316095488.0,
+            "5": 7316095488.0,
+            "6": 7316095488.0,
+            "7": 7316095488.0,
+            "8": 7316095488.0,
+            "9": 7316095488.0,
+            "10": 7316095488.0,
+            "11": 7316095488.0,
+            "12": 7316095488.0,
+            "13": 7316095488.0,
+            "14": 7316095488.0,
+            "15": 7316095488.0,
+            "16": 7316095488.0,
+            "17": 7316095488.0,
+            "18": 7316095488.0,
+            "19": 7316095488.0,
+            "20": 7316095488.0,
+            "21": 7316095488.0,
+            "22": 7316095488.0,
+            "23": 7316095488.0,
+            "24": 7316095488.0,
+            "25": 7316095488.0,
+            "26": 7316095488.0,
+            "27": 7316095488.0,
+            "28": 7316095488.0,
+            "29": 7316095488.0,
+            "30": 7316095488.0,
+            "31": 7316095488.0,
+            "32": 7316095488.0,
+            "33": 7316095488.0,
+            "34": 7316095488.0,
+            "35": 7316095488.0,
+            "36": 7316095488.0,
+            "37": 7316095488.0,
+            "38": 7316095488.0,
+            "39": 7316095488.0,
+            "40": 7316095488.0,
+            "41": 7316095488.0,
+            "42": 7316095488.0,
+            "43": 7316095488.0,
+            "44": 7316095488.0,
+            "45": 7316095488.0,
+            "46": 7316095488.0,
+            "47": 7316095488.0,
+            "48": 7316095488.0,
+            "49": 7316095488.0,
+            "50": 7316095488.0
         }
     },
     "mem-max-allocated-bytes": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 53176152064.0,
-            "5": 55926337536.0,
-            "10": 55926337536.0,
-            "15": 55926337536.0,
-            "20": 55926337536.0,
-            "25": 56534257664.0,
-            "30": 57393635328.0,
-            "35": 57393635328.0,
-            "40": 57578217472.0,
-            "45": 57578217472.0,
-            "50": 57578217472.0
+            "1": 53549867008.0,
+            "2": 56295710720.0,
+            "3": 56295710720.0,
+            "4": 56295710720.0,
+            "5": 56295710720.0,
+            "6": 56295710720.0,
+            "7": 56295710720.0,
+            "8": 56295710720.0,
+            "9": 56295710720.0,
+            "10": 56295710720.0,
+            "11": 56295710720.0,
+            "12": 56295710720.0,
+            "13": 56295710720.0,
+            "14": 56295710720.0,
+            "15": 56295710720.0,
+            "16": 56295710720.0,
+            "17": 56295710720.0,
+            "18": 56295710720.0,
+            "19": 56295710720.0,
+            "20": 56295710720.0,
+            "21": 56295710720.0,
+            "22": 56295710720.0,
+            "23": 56295710720.0,
+            "24": 56738553856.0,
+            "25": 56738553856.0,
+            "26": 56777162752.0,
+            "27": 56777162752.0,
+            "28": 56777162752.0,
+            "29": 56777162752.0,
+            "30": 56777162752.0,
+            "31": 56777162752.0,
+            "32": 56777162752.0,
+            "33": 56777162752.0,
+            "34": 56824344576.0,
+            "35": 57080135680.0,
+            "36": 57331695616.0,
+            "37": 57331695616.0,
+            "38": 57577013248.0,
+            "39": 57577013248.0,
+            "40": 57577013248.0,
+            "41": 57577013248.0,
+            "42": 57577013248.0,
+            "43": 57587191808.0,
+            "44": 57596944384.0,
+            "45": 57705652224.0,
+            "46": 57790390272.0,
+            "47": 57790390272.0,
+            "48": 57790390272.0,
+            "49": 57790390272.0,
+            "50": 57790390272.0
         }
     },
     "mtp_1 loss": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 11.0776,
-            "5": 9.87653,
-            "10": 9.02332,
-            "15": 7.91471,
-            "20": 7.75886,
-            "25": 7.56825,
-            "30": 7.53841,
-            "35": 7.12192,
-            "40": 7.44579,
-            "45": 7.09307,
-            "50": 6.94739
+            "1": 11.07756,
+            "2": 11.07651,
+            "3": 10.53063,
+            "4": 10.08611,
+            "5": 9.87524,
+            "6": 9.55366,
+            "7": 9.62345,
+            "8": 8.91012,
+            "9": 8.72228,
+            "10": 9.02504,
+            "11": 8.39501,
+            "12": 8.42504,
+            "13": 8.32334,
+            "14": 7.76976,
+            "15": 7.91789,
+            "16": 7.97018,
+            "17": 7.92051,
+            "18": 7.65266,
+            "19": 8.0377,
+            "20": 7.76074,
+            "21": 7.44752,
+            "22": 7.43657,
+            "23": 7.30984,
+            "24": 7.31186,
+            "25": 7.56562,
+            "26": 6.97201,
+            "27": 7.50933,
+            "28": 7.2266,
+            "29": 7.40633,
+            "30": 7.53569,
+            "31": 7.28904,
+            "32": 7.47424,
+            "33": 7.53526,
+            "34": 7.59404,
+            "35": 7.11968,
+            "36": 6.9867,
+            "37": 7.32338,
+            "38": 7.09605,
+            "39": 7.45524,
+            "40": 7.44706,
+            "41": 7.39271,
+            "42": 7.14573,
+            "43": 7.13128,
+            "44": 7.31399,
+            "45": 7.08836,
+            "46": 6.80158,
+            "47": 7.2062,
+            "48": 7.0468,
+            "49": 7.47982,
+            "50": 6.94494
         }
     },
     "iteration-time": {
         "start_step": 1,
         "end_step": 50,
-        "step_interval": 5,
+        "step_interval": 1,
         "values": {
-            "1": 51.33936,
-            "5": 1.24167,
-            "10": 1.14623,
-            "15": 1.16973,
-            "20": 1.23165,
-            "25": 1.13719,
-            "30": 1.15864,
-            "35": 1.13509,
-            "40": 1.14729,
-            "45": 1.14136,
-            "50": 1.13625
+            "1": 102.52307,
+            "2": 1.75305,
+            "3": 1.36681,
+            "4": 1.62808,
+            "5": 1.13714,
+            "6": 1.45805,
+            "7": 1.6121,
+            "8": 1.20031,
+            "9": 1.09784,
+            "10": 1.10383,
+            "11": 1.10878,
+            "12": 1.18093,
+            "13": 1.43808,
+            "14": 1.17223,
+            "15": 1.11575,
+            "16": 1.1159,
+            "17": 1.11727,
+            "18": 1.10751,
+            "19": 1.11189,
+            "20": 1.1082,
+            "21": 1.10459,
+            "22": 1.11252,
+            "23": 1.10744,
+            "24": 1.12218,
+            "25": 1.09823,
+            "26": 1.11657,
+            "27": 1.08949,
+            "28": 1.10254,
+            "29": 1.10189,
+            "30": 1.08963,
+            "31": 1.10454,
+            "32": 1.09654,
+            "33": 1.08747,
+            "34": 1.09674,
+            "35": 1.09106,
+            "36": 1.08904,
+            "37": 1.1178,
+            "38": 1.09379,
+            "39": 1.10306,
+            "40": 1.09998,
+            "41": 1.08808,
+            "42": 1.0941,
+            "43": 1.0919,
+            "44": 1.0813,
+            "45": 1.08715,
+            "46": 1.07061,
+            "47": 1.07098,
+            "48": 1.07438,
+            "49": 1.07469,
+            "50": 1.0719
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json
index 5e07fb3bfad..fd4a1a5bb30 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200.json
@@ -4,106 +4,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 10.81199,
-            "2": 10.82649,
-            "3": 10.81402,
-            "4": 10.79444,
-            "5": 10.8356,
-            "6": 10.84311,
-            "7": 10.83557,
-            "8": 10.83498,
-            "9": 10.83668,
-            "10": 10.78964,
-            "11": 10.85912,
-            "12": 10.84339,
-            "13": 10.84997,
-            "14": 10.86414,
-            "15": 10.80576,
-            "16": 10.7918,
-            "17": 10.76394,
-            "18": 10.78766,
-            "19": 10.78774,
-            "20": 10.70812,
-            "21": 10.6864,
-            "22": 10.53307,
-            "23": 10.69044,
-            "24": 10.5809,
-            "25": 10.52886,
-            "26": 10.57744,
-            "27": 10.58939,
-            "28": 10.56471,
-            "29": 10.56607,
-            "30": 10.35103,
-            "31": 10.09367,
-            "32": 10.43199,
-            "33": 10.43216,
-            "34": 10.19633,
-            "35": 10.23455,
-            "36": 10.19036,
-            "37": 10.31682,
-            "38": 10.16475,
-            "39": 10.3741,
-            "40": 10.05088,
-            "41": 10.10003,
-            "42": 10.17734,
-            "43": 9.79377,
-            "44": 9.91897,
-            "45": 9.79315,
-            "46": 9.78119,
-            "47": 10.10601,
-            "48": 9.81175,
-            "49": 9.4813,
-            "50": 9.86738,
-            "51": 9.80706,
-            "52": 9.70288,
-            "53": 10.03514,
-            "54": 9.92065,
-            "55": 9.84605,
-            "56": 9.58055,
-            "57": 9.43481,
-            "58": 9.79877,
-            "59": 9.54386,
-            "60": 9.4523,
-            "61": 9.65803,
-            "62": 9.95373,
-            "63": 9.34019,
-            "64": 9.73453,
-            "65": 8.90212,
-            "66": 9.66653,
-            "67": 9.33709,
-            "68": 9.75619,
-            "69": 9.77579,
-            "70": 9.70272,
-            "71": 9.60206,
-            "72": 9.54307,
-            "73": 9.4557,
-            "74": 8.87807,
-            "75": 9.37673,
-            "76": 9.03809,
-            "77": 10.03878,
-            "78": 9.69735,
-            "79": 9.35192,
-            "80": 9.37284,
-            "81": 9.45647,
-            "82": 9.67999,
-            "83": 9.27725,
-            "84": 9.39356,
-            "85": 9.58912,
-            "86": 9.05149,
-            "87": 9.57627,
-            "88": 9.72865,
-            "89": 9.5761,
-            "90": 9.80906,
-            "91": 9.30685,
-            "92": 9.33841,
-            "93": 9.05655,
-            "94": 8.80359,
-            "95": 9.50883,
-            "96": 9.50764,
-            "97": 9.27773,
-            "98": 9.65276,
-            "99": 8.87014,
-            "100": 9.38138
+            "1": 10.91904,
+            "2": 10.91571,
+            "3": 10.92092,
+            "4": 10.91263,
+            "5": 10.92017,
+            "6": 10.91251,
+            "7": 10.90273,
+            "8": 10.90946,
+            "9": 10.91875,
+            "10": 10.9129,
+            "11": 10.89482,
+            "12": 10.90118,
+            "13": 10.8949,
+            "14": 10.88931,
+            "15": 10.87879,
+            "16": 10.86005,
+            "17": 10.86647,
+            "18": 10.85501,
+            "19": 10.85645,
+            "20": 10.79526,
+            "21": 10.76943,
+            "22": 10.75442,
+            "23": 10.75546,
+            "24": 10.71717,
+            "25": 10.7161,
+            "26": 10.69524,
+            "27": 10.66656,
+            "28": 10.60162,
+            "29": 10.56716,
+            "30": 10.54165,
+            "31": 10.53805,
+            "32": 10.52155,
+            "33": 10.48913,
+            "34": 10.45639,
+            "35": 10.45159,
+            "36": 10.43122,
+            "37": 10.39793,
+            "38": 10.40494,
+            "39": 10.3669,
+            "40": 10.34742,
+            "41": 10.33408,
+            "42": 10.31732,
+            "43": 10.29406,
+            "44": 10.25533,
+            "45": 10.27595,
+            "46": 10.23783,
+            "47": 10.22136,
+            "48": 10.18248,
+            "49": 10.17993,
+            "50": 10.18122,
+            "51": 10.18357,
+            "52": 10.13801,
+            "53": 10.14673,
+            "54": 10.10855,
+            "55": 10.08907,
+            "56": 10.10825,
+            "57": 10.10375,
+            "58": 10.11146,
+            "59": 10.06199,
+            "60": 10.08053,
+            "61": 10.03534,
+            "62": 10.00906,
+            "63": 10.07699,
+            "64": 10.03573,
+            "65": 10.01016,
+            "66": 10.0313,
+            "67": 10.01163,
+            "68": 9.97607,
+            "69": 9.99302,
+            "70": 9.97488,
+            "71": 10.00448,
+            "72": 9.97663,
+            "73": 9.96764,
+            "74": 9.95,
+            "75": 9.92691,
+            "76": 9.96253,
+            "77": 9.95211,
+            "78": 9.89495,
+            "79": 9.90013,
+            "80": 9.91207,
+            "81": 9.94089,
+            "82": 9.88436,
+            "83": 9.83821,
+            "84": 9.77882,
+            "85": 9.76545,
+            "86": 9.8708,
+            "87": 9.89822,
+            "88": 9.86929,
+            "89": 9.81258,
+            "90": 9.79994,
+            "91": 9.81216,
+            "92": 9.80689,
+            "93": 9.74583,
+            "94": 9.81719,
+            "95": 9.8138,
+            "96": 9.79844,
+            "97": 9.7308,
+            "98": 9.77083,
+            "99": 9.81473,
+            "100": 9.7054
         }
     },
     "num-zeros": {
@@ -111,106 +111,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 26523.0,
-            "2": 29472.0,
-            "3": 29140.0,
-            "4": 28787.0,
-            "5": 32154.0,
-            "6": 33150.0,
-            "7": 31503.0,
-            "8": 27488.0,
-            "9": 30851.0,
-            "10": 25539.0,
-            "11": 33735.0,
-            "12": 30721.0,
-            "13": 33360.0,
-            "14": 33374.0,
-            "15": 30838.0,
-            "16": 32360.0,
-            "17": 31588.0,
-            "18": 31016.0,
-            "19": 31320.0,
-            "20": 28419.0,
-            "21": 29325.0,
-            "22": 27567.0,
-            "23": 34221.0,
-            "24": 28953.0,
-            "25": 27716.0,
-            "26": 31399.0,
-            "27": 31596.0,
-            "28": 33689.0,
-            "29": 35335.0,
-            "30": 30311.0,
-            "31": 27380.0,
-            "32": 33651.0,
-            "33": 36080.0,
-            "34": 30178.0,
-            "35": 31974.0,
-            "36": 32609.0,
-            "37": 36504.0,
-            "38": 33985.0,
-            "39": 38040.0,
-            "40": 34901.0,
-            "41": 33629.0,
-            "42": 35751.0,
-            "43": 32869.0,
-            "44": 32821.0,
-            "45": 34054.0,
-            "46": 34067.0,
-            "47": 40493.0,
-            "48": 35146.0,
-            "49": 34756.0,
-            "50": 38467.0,
-            "51": 36706.0,
-            "52": 36171.0,
-            "53": 42698.0,
-            "54": 41471.0,
-            "55": 38216.0,
-            "56": 41916.0,
-            "57": 36330.0,
-            "58": 41283.0,
-            "59": 40081.0,
-            "60": 55891.0,
-            "61": 59793.0,
-            "62": 2137191.0,
-            "63": 36446.0,
-            "64": 128493.0,
-            "65": 43769.0,
-            "66": 2139269.0,
-            "67": 2137293.0,
-            "68": 2136798.0,
-            "69": 2139311.0,
-            "70": 2140379.0,
-            "71": 2138932.0,
-            "72": 2138654.0,
-            "73": 2141565.0,
-            "74": 2137087.0,
-            "75": 2137011.0,
-            "76": 2140501.0,
-            "77": 2140898.0,
-            "78": 2142043.0,
-            "79": 2142782.0,
-            "80": 2141568.0,
-            "81": 2145750.0,
-            "82": 2144812.0,
-            "83": 2141262.0,
-            "84": 2140595.0,
-            "85": 2145583.0,
-            "86": 2140562.0,
-            "87": 2144769.0,
-            "88": 2142291.0,
-            "89": 2140641.0,
-            "90": 2144794.0,
-            "91": 2143597.0,
-            "92": 2141696.0,
-            "93": 2139704.0,
-            "94": 2145839.0,
-            "95": 2142840.0,
-            "96": 2145981.0,
-            "97": 2140183.0,
-            "98": 2143585.0,
-            "99": 2143931.0,
-            "100": 2142923.0
+            "1": 31669.0,
+            "2": 32700.0,
+            "3": 32349.0,
+            "4": 32157.0,
+            "5": 32347.0,
+            "6": 31532.0,
+            "7": 32423.0,
+            "8": 31994.0,
+            "9": 32442.0,
+            "10": 32920.0,
+            "11": 32509.0,
+            "12": 31413.0,
+            "13": 32556.0,
+            "14": 32530.0,
+            "15": 31531.0,
+            "16": 31236.0,
+            "17": 32473.0,
+            "18": 32343.0,
+            "19": 32357.0,
+            "20": 32740.0,
+            "21": 32381.0,
+            "22": 33007.0,
+            "23": 33012.0,
+            "24": 33421.0,
+            "25": 31671.0,
+            "26": 33468.0,
+            "27": 32807.0,
+            "28": 32501.0,
+            "29": 32952.0,
+            "30": 33493.0,
+            "31": 34743.0,
+            "32": 34941.0,
+            "33": 34597.0,
+            "34": 34650.0,
+            "35": 35582.0,
+            "36": 34748.0,
+            "37": 36094.0,
+            "38": 34719.0,
+            "39": 36470.0,
+            "40": 37447.0,
+            "41": 36207.0,
+            "42": 43139.0,
+            "43": 48341.0,
+            "44": 133194.0,
+            "45": 148685.0,
+            "46": 2144590.0,
+            "47": 2254471.0,
+            "48": 2138081.0,
+            "49": 2242903.0,
+            "50": 2155011.0,
+            "51": 2222774.0,
+            "52": 2254396.0,
+            "53": 2230278.0,
+            "54": 4237503.0,
+            "55": 2252607.0,
+            "56": 2241594.0,
+            "57": 4229311.0,
+            "58": 2252850.0,
+            "59": 2249962.0,
+            "60": 2239587.0,
+            "61": 4236257.0,
+            "62": 4239445.0,
+            "63": 4248528.0,
+            "64": 4242624.0,
+            "65": 4234629.0,
+            "66": 4237583.0,
+            "67": 4268339.0,
+            "68": 4239062.0,
+            "69": 4237275.0,
+            "70": 4237689.0,
+            "71": 4237172.0,
+            "72": 4237903.0,
+            "73": 2211616.0,
+            "74": 4235627.0,
+            "75": 4237297.0,
+            "76": 2225851.0,
+            "77": 4238129.0,
+            "78": 4239495.0,
+            "79": 2268212.0,
+            "80": 2242352.0,
+            "81": 4237785.0,
+            "82": 253952.0,
+            "83": 2265526.0,
+            "84": 2146594.0,
+            "85": 2212331.0,
+            "86": 2250024.0,
+            "87": 4235496.0,
+            "88": 2236401.0,
+            "89": 2155282.0,
+            "90": 171831.0,
+            "91": 2155804.0,
+            "92": 39235.0,
+            "93": 157576.0,
+            "94": 60717.0,
+            "95": 2140994.0,
+            "96": 2140566.0,
+            "97": 2150987.0,
+            "98": 127906.0,
+            "99": 54570.0,
+            "100": 2142700.0
         }
     },
     "mem-allocated-bytes": {
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 787591680.0,
-            "2": 787578880.0,
-            "3": 787594240.0,
-            "4": 787568128.0,
-            "5": 787563008.0,
-            "6": 787585536.0,
-            "7": 787578880.0,
-            "8": 787582464.0,
-            "9": 787581952.0,
-            "10": 787591680.0,
-            "11": 787569152.0,
-            "12": 787570176.0,
-            "13": 787579392.0,
-            "14": 787582464.0,
-            "15": 787566592.0,
-            "16": 787572736.0,
-            "17": 787567104.0,
-            "18": 787546624.0,
-            "19": 787567104.0,
-            "20": 787536384.0,
-            "21": 787540992.0,
-            "22": 787541504.0,
-            "23": 787549696.0,
-            "24": 787540992.0,
-            "25": 787533824.0,
-            "26": 787548672.0,
-            "27": 787510784.0,
+            "1": 787582464.0,
+            "2": 787569152.0,
+            "3": 787573248.0,
+            "4": 787560960.0,
+            "5": 787587584.0,
+            "6": 787579904.0,
+            "7": 787585024.0,
+            "8": 787584512.0,
+            "9": 787577344.0,
+            "10": 787568640.0,
+            "11": 787561984.0,
+            "12": 787573760.0,
+            "13": 787582976.0,
+            "14": 787571200.0,
+            "15": 787573760.0,
+            "16": 787566592.0,
+            "17": 787563520.0,
+            "18": 787553280.0,
+            "19": 787569664.0,
+            "20": 787535872.0,
+            "21": 787532800.0,
+            "22": 787522560.0,
+            "23": 787521024.0,
+            "24": 787528704.0,
+            "25": 787526144.0,
+            "26": 787515904.0,
+            "27": 787510272.0,
             "28": 787505152.0,
-            "29": 787500544.0,
-            "30": 787493376.0,
-            "31": 787511808.0,
-            "32": 787501568.0,
-            "33": 787482624.0,
-            "34": 787486208.0,
-            "35": 787483136.0,
-            "36": 787484160.0,
-            "37": 787461120.0,
-            "38": 787457024.0,
-            "39": 787462144.0,
-            "40": 787456512.0,
-            "41": 787467264.0,
-            "42": 787431936.0,
-            "43": 787449856.0,
-            "44": 787436032.0,
-            "45": 787411968.0,
-            "46": 787460096.0,
-            "47": 787413504.0,
-            "48": 787440128.0,
-            "49": 787410432.0,
-            "50": 787396096.0,
-            "51": 787388928.0,
-            "52": 787413504.0,
-            "53": 787377152.0,
-            "54": 787404288.0,
-            "55": 787375104.0,
-            "56": 787362304.0,
-            "57": 787405824.0,
-            "58": 787355648.0,
-            "59": 787378176.0,
-            "60": 787379712.0,
-            "61": 787339264.0,
-            "62": 787331072.0,
-            "63": 787369472.0,
-            "64": 787339264.0,
-            "65": 787403776.0,
-            "66": 787329024.0,
-            "67": 787337216.0,
-            "68": 787323904.0,
-            "69": 787335168.0,
-            "70": 787329536.0,
-            "71": 787331072.0,
-            "72": 787341824.0,
-            "73": 787351552.0,
-            "74": 787365376.0,
-            "75": 787343360.0,
-            "76": 787343872.0,
-            "77": 787344896.0,
-            "78": 787371520.0,
-            "79": 787366912.0,
-            "80": 787387904.0,
-            "81": 787384832.0,
-            "82": 787393536.0,
-            "83": 787403264.0,
-            "84": 787397632.0,
-            "85": 787397120.0,
-            "86": 787410432.0,
-            "87": 787389952.0,
-            "88": 787387392.0,
-            "89": 787400192.0,
-            "90": 787379200.0,
-            "91": 787400192.0,
-            "92": 787397632.0,
-            "93": 787390976.0,
-            "94": 787393024.0,
-            "95": 787398656.0,
-            "96": 787397120.0,
-            "97": 787403776.0,
-            "98": 787398144.0,
-            "99": 787408896.0,
-            "100": 787411968.0
+            "29": 787501568.0,
+            "30": 787482112.0,
+            "31": 787488768.0,
+            "32": 787486208.0,
+            "33": 787490816.0,
+            "34": 787476992.0,
+            "35": 787472384.0,
+            "36": 787466240.0,
+            "37": 787457024.0,
+            "38": 787464704.0,
+            "39": 787448320.0,
+            "40": 787445248.0,
+            "41": 787447296.0,
+            "42": 787424256.0,
+            "43": 787424768.0,
+            "44": 787417088.0,
+            "45": 787401216.0,
+            "46": 787384832.0,
+            "47": 787381248.0,
+            "48": 787385344.0,
+            "49": 787369472.0,
+            "50": 787362816.0,
+            "51": 787367936.0,
+            "52": 787363328.0,
+            "53": 787363840.0,
+            "54": 787364352.0,
+            "55": 787363328.0,
+            "56": 787361280.0,
+            "57": 787373056.0,
+            "58": 787368960.0,
+            "59": 787363328.0,
+            "60": 787378688.0,
+            "61": 787365376.0,
+            "62": 787369472.0,
+            "63": 787372032.0,
+            "64": 787369984.0,
+            "65": 787375104.0,
+            "66": 787365888.0,
+            "67": 787370496.0,
+            "68": 787376128.0,
+            "69": 787379200.0,
+            "70": 787380736.0,
+            "71": 787378176.0,
+            "72": 787383296.0,
+            "73": 787386880.0,
+            "74": 787383808.0,
+            "75": 787384320.0,
+            "76": 787388928.0,
+            "77": 787406848.0,
+            "78": 787406848.0,
+            "79": 787397120.0,
+            "80": 787408896.0,
+            "81": 787419648.0,
+            "82": 787425280.0,
+            "83": 787414528.0,
+            "84": 787438080.0,
+            "85": 787434496.0,
+            "86": 787440640.0,
+            "87": 787422208.0,
+            "88": 787412992.0,
+            "89": 787431424.0,
+            "90": 787455488.0,
+            "91": 787439104.0,
+            "92": 787461120.0,
+            "93": 787444736.0,
+            "94": 787449344.0,
+            "95": 787457024.0,
+            "96": 787442176.0,
+            "97": 787452416.0,
+            "98": 787451904.0,
+            "99": 787429376.0,
+            "100": 787459584.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 2662647296.0,
-            "2": 2662647296.0,
-            "3": 2665063424.0,
-            "4": 2665063424.0,
-            "5": 2665063424.0,
-            "6": 2665063424.0,
-            "7": 2665063424.0,
-            "8": 2665063424.0,
-            "9": 2665063424.0,
-            "10": 2665063424.0,
-            "11": 2665063424.0,
-            "12": 2665063424.0,
-            "13": 2665063424.0,
-            "14": 2665063424.0,
-            "15": 2665063424.0,
-            "16": 2665063424.0,
-            "17": 2665063424.0,
-            "18": 2665063424.0,
-            "19": 2665063424.0,
-            "20": 2665063424.0,
-            "21": 2665063424.0,
-            "22": 2665063424.0,
-            "23": 2665063424.0,
-            "24": 2665063424.0,
-            "25": 2665063424.0,
-            "26": 2665063424.0,
-            "27": 2665063424.0,
-            "28": 2665063424.0,
-            "29": 2665063424.0,
-            "30": 2665063424.0,
-            "31": 2665063424.0,
-            "32": 2665063424.0,
-            "33": 2665063424.0,
-            "34": 2665063424.0,
-            "35": 2665063424.0,
-            "36": 2665063424.0,
-            "37": 2665063424.0,
-            "38": 2665063424.0,
-            "39": 2665063424.0,
-            "40": 2665063424.0,
-            "41": 2665063424.0,
-            "42": 2665063424.0,
-            "43": 2665063424.0,
-            "44": 2665063424.0,
-            "45": 2665063424.0,
-            "46": 2665063424.0,
-            "47": 2665063424.0,
-            "48": 2665063424.0,
-            "49": 2665063424.0,
-            "50": 2665063424.0,
-            "51": 2665063424.0,
-            "52": 2665063424.0,
-            "53": 2665063424.0,
-            "54": 2665063424.0,
-            "55": 2665063424.0,
-            "56": 2665063424.0,
-            "57": 2665063424.0,
-            "58": 2665063424.0,
-            "59": 2665063424.0,
-            "60": 2665063424.0,
-            "61": 2665063424.0,
-            "62": 2665063424.0,
-            "63": 2665063424.0,
-            "64": 2665063424.0,
-            "65": 2665063424.0,
-            "66": 2665063424.0,
-            "67": 2665063424.0,
-            "68": 2665063424.0,
-            "69": 2665063424.0,
-            "70": 2665063424.0,
-            "71": 2665063424.0,
-            "72": 2665063424.0,
-            "73": 2665063424.0,
-            "74": 2665063424.0,
-            "75": 2665063424.0,
-            "76": 2665063424.0,
-            "77": 2665063424.0,
-            "78": 2665063424.0,
-            "79": 2665063424.0,
-            "80": 2665063424.0,
-            "81": 2665063424.0,
-            "82": 2665063424.0,
-            "83": 2665063424.0,
-            "84": 2665063424.0,
-            "85": 2665063424.0,
-            "86": 2665063424.0,
-            "87": 2665063424.0,
-            "88": 2665063424.0,
-            "89": 2665063424.0,
-            "90": 2665063424.0,
-            "91": 2665063424.0,
-            "92": 2665063424.0,
-            "93": 2665063424.0,
-            "94": 2665063424.0,
-            "95": 2665063424.0,
-            "96": 2665063424.0,
-            "97": 2665063424.0,
-            "98": 2665063424.0,
-            "99": 2665063424.0,
-            "100": 2665063424.0
+            "1": 2657495552.0,
+            "2": 2657495552.0,
+            "3": 2657495552.0,
+            "4": 2657495552.0,
+            "5": 2659698176.0,
+            "6": 2659698176.0,
+            "7": 2659698176.0,
+            "8": 2659698176.0,
+            "9": 2659698176.0,
+            "10": 2659698176.0,
+            "11": 2659698176.0,
+            "12": 2664887808.0,
+            "13": 2664887808.0,
+            "14": 2664887808.0,
+            "15": 2664887808.0,
+            "16": 2664887808.0,
+            "17": 2664887808.0,
+            "18": 2664887808.0,
+            "19": 2664887808.0,
+            "20": 2664887808.0,
+            "21": 2664887808.0,
+            "22": 2664887808.0,
+            "23": 2664887808.0,
+            "24": 2664887808.0,
+            "25": 2664887808.0,
+            "26": 2664887808.0,
+            "27": 2664887808.0,
+            "28": 2664887808.0,
+            "29": 2664887808.0,
+            "30": 2664887808.0,
+            "31": 2664887808.0,
+            "32": 2664887808.0,
+            "33": 2664887808.0,
+            "34": 2664887808.0,
+            "35": 2664887808.0,
+            "36": 2664887808.0,
+            "37": 2664887808.0,
+            "38": 2664887808.0,
+            "39": 2664887808.0,
+            "40": 2664887808.0,
+            "41": 2664887808.0,
+            "42": 2664887808.0,
+            "43": 2664887808.0,
+            "44": 2664887808.0,
+            "45": 2664887808.0,
+            "46": 2664887808.0,
+            "47": 2664887808.0,
+            "48": 2664887808.0,
+            "49": 2664887808.0,
+            "50": 2664887808.0,
+            "51": 2664887808.0,
+            "52": 2664887808.0,
+            "53": 2664887808.0,
+            "54": 2664887808.0,
+            "55": 2664887808.0,
+            "56": 2664887808.0,
+            "57": 2664887808.0,
+            "58": 2664887808.0,
+            "59": 2664887808.0,
+            "60": 2664887808.0,
+            "61": 2664887808.0,
+            "62": 2664887808.0,
+            "63": 2664887808.0,
+            "64": 2664887808.0,
+            "65": 2664887808.0,
+            "66": 2664887808.0,
+            "67": 2664887808.0,
+            "68": 2664887808.0,
+            "69": 2664887808.0,
+            "70": 2664887808.0,
+            "71": 2664887808.0,
+            "72": 2664887808.0,
+            "73": 2664887808.0,
+            "74": 2664887808.0,
+            "75": 2664887808.0,
+            "76": 2664887808.0,
+            "77": 2664887808.0,
+            "78": 2664887808.0,
+            "79": 2664887808.0,
+            "80": 2664887808.0,
+            "81": 2664887808.0,
+            "82": 2664887808.0,
+            "83": 2664887808.0,
+            "84": 2664887808.0,
+            "85": 2664887808.0,
+            "86": 2664887808.0,
+            "87": 2664887808.0,
+            "88": 2664887808.0,
+            "89": 2664887808.0,
+            "90": 2664887808.0,
+            "91": 2664887808.0,
+            "92": 2664887808.0,
+            "93": 2664887808.0,
+            "94": 2664887808.0,
+            "95": 2664887808.0,
+            "96": 2664887808.0,
+            "97": 2664887808.0,
+            "98": 2664887808.0,
+            "99": 2664887808.0,
+            "100": 2664887808.0
         }
     },
     "iteration-time": {
@@ -433,105 +433,105 @@
         "step_interval": 1,
         "values": {
             "1": "nan",
-            "2": 5.98635,
-            "3": 0.52282,
-            "4": 0.49693,
-            "5": 0.4928,
-            "6": 0.48259,
-            "7": 0.4965,
-            "8": 0.47878,
-            "9": 0.47293,
-            "10": 0.47259,
-            "11": 0.47135,
-            "12": 0.47375,
-            "13": 0.46469,
-            "14": 0.4653,
-            "15": 0.47382,
-            "16": 0.48208,
-            "17": 0.47932,
-            "18": 0.46393,
-            "19": 0.46346,
-            "20": 0.47236,
-            "21": 0.4714,
-            "22": 0.47499,
-            "23": 0.47258,
-            "24": 0.46914,
-            "25": 0.47024,
-            "26": 0.46574,
-            "27": 0.47482,
-            "28": 0.47982,
-            "29": 0.48899,
-            "30": 0.49411,
-            "31": 0.48791,
-            "32": 0.48868,
-            "33": 0.48565,
-            "34": 0.48033,
-            "35": 0.48225,
-            "36": 0.47838,
-            "37": 0.48688,
-            "38": 0.48265,
-            "39": 0.48609,
-            "40": 0.48829,
-            "41": 0.48993,
-            "42": 0.49163,
-            "43": 0.48738,
-            "44": 0.48033,
-            "45": 0.48298,
-            "46": 0.49224,
-            "47": 0.47934,
-            "48": 0.48869,
-            "49": 0.47492,
-            "50": 0.47463,
-            "51": 0.49442,
-            "52": 0.4729,
-            "53": 0.47381,
-            "54": 0.47741,
-            "55": 0.48415,
-            "56": 0.48472,
-            "57": 0.49879,
-            "58": 0.48585,
-            "59": 0.49378,
-            "60": 0.49224,
-            "61": 0.48445,
-            "62": 0.47883,
-            "63": 0.48658,
-            "64": 0.48416,
-            "65": 0.47652,
-            "66": 0.47867,
-            "67": 0.5028,
-            "68": 0.48553,
-            "69": 0.48415,
-            "70": 0.47946,
-            "71": 0.47869,
-            "72": 0.47973,
-            "73": 0.48056,
-            "74": 0.48003,
-            "75": 0.48769,
-            "76": 0.4697,
-            "77": 0.47534,
-            "78": 0.46682,
-            "79": 0.47552,
-            "80": 0.47839,
-            "81": 0.48653,
-            "82": 0.48245,
-            "83": 0.48713,
-            "84": 0.4737,
-            "85": 0.47339,
-            "86": 0.47528,
-            "87": 0.48514,
-            "88": 0.47048,
-            "89": 0.47146,
-            "90": 0.81332,
-            "91": 0.4747,
-            "92": 0.47449,
-            "93": 0.47825,
-            "94": 0.47459,
-            "95": 0.47757,
-            "96": 0.47444,
-            "97": 0.46924,
-            "98": 0.47068,
-            "99": 0.47128,
-            "100": 0.47481
+            "2": 5.71796,
+            "3": 0.50186,
+            "4": 0.50306,
+            "5": 0.5085,
+            "6": 0.50678,
+            "7": 0.4904,
+            "8": 0.48018,
+            "9": 0.49025,
+            "10": 0.47283,
+            "11": 0.49137,
+            "12": 0.47486,
+            "13": 0.47709,
+            "14": 0.47499,
+            "15": 0.47898,
+            "16": 0.47425,
+            "17": 0.48175,
+            "18": 0.47372,
+            "19": 0.4817,
+            "20": 0.48774,
+            "21": 0.49082,
+            "22": 0.47964,
+            "23": 0.49299,
+            "24": 0.47607,
+            "25": 0.48242,
+            "26": 0.4836,
+            "27": 0.48117,
+            "28": 0.49828,
+            "29": 0.49164,
+            "30": 0.48724,
+            "31": 0.48704,
+            "32": 0.48833,
+            "33": 0.49502,
+            "34": 0.49441,
+            "35": 0.49488,
+            "36": 0.50063,
+            "37": 0.49161,
+            "38": 0.4956,
+            "39": 0.49242,
+            "40": 0.48551,
+            "41": 0.48067,
+            "42": 0.4913,
+            "43": 0.49029,
+            "44": 0.48554,
+            "45": 0.48842,
+            "46": 0.48198,
+            "47": 0.47871,
+            "48": 0.48152,
+            "49": 0.47483,
+            "50": 0.48111,
+            "51": 0.73498,
+            "52": 0.48088,
+            "53": 0.47221,
+            "54": 0.47759,
+            "55": 0.48061,
+            "56": 0.47758,
+            "57": 0.47659,
+            "58": 0.48066,
+            "59": 0.48258,
+            "60": 0.47935,
+            "61": 0.48291,
+            "62": 0.48165,
+            "63": 0.48219,
+            "64": 0.4821,
+            "65": 0.4805,
+            "66": 0.48691,
+            "67": 0.48324,
+            "68": 0.48693,
+            "69": 0.49669,
+            "70": 0.48476,
+            "71": 0.48074,
+            "72": 0.48791,
+            "73": 0.48117,
+            "74": 0.48262,
+            "75": 0.48424,
+            "76": 0.48338,
+            "77": 0.4801,
+            "78": 0.48198,
+            "79": 0.4832,
+            "80": 0.48239,
+            "81": 0.48493,
+            "82": 0.48114,
+            "83": 0.48639,
+            "84": 0.48251,
+            "85": 0.4826,
+            "86": 0.49605,
+            "87": 0.49461,
+            "88": 0.48091,
+            "89": 0.48935,
+            "90": 0.48226,
+            "91": 0.4833,
+            "92": 0.48895,
+            "93": 0.48578,
+            "94": 0.48657,
+            "95": 0.48407,
+            "96": 0.48632,
+            "97": 0.48648,
+            "98": 0.48229,
+            "99": 0.47617,
+            "100": 0.48011
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200_2nd.json
new file mode 100644
index 00000000000..0fa278f84f1
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe/gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router/golden_values_dev_dgx_gb200_2nd.json
@@ -0,0 +1,537 @@
+{
+    "lm loss": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 10.18357,
+            "52": 10.13801,
+            "53": 10.14673,
+            "54": 10.10855,
+            "55": 10.08907,
+            "56": 10.10825,
+            "57": 10.10375,
+            "58": 10.11146,
+            "59": 10.06199,
+            "60": 10.08053,
+            "61": 10.03534,
+            "62": 10.00906,
+            "63": 10.07699,
+            "64": 10.03573,
+            "65": 10.01016,
+            "66": 10.0313,
+            "67": 10.01163,
+            "68": 9.97607,
+            "69": 9.99302,
+            "70": 9.97488,
+            "71": 10.00448,
+            "72": 9.97663,
+            "73": 9.96764,
+            "74": 9.95,
+            "75": 9.92691,
+            "76": 9.96253,
+            "77": 9.95211,
+            "78": 9.89495,
+            "79": 9.90013,
+            "80": 9.91207,
+            "81": 9.94089,
+            "82": 9.88436,
+            "83": 9.83821,
+            "84": 9.77882,
+            "85": 9.76545,
+            "86": 9.8708,
+            "87": 9.89822,
+            "88": 9.86929,
+            "89": 9.81258,
+            "90": 9.79994,
+            "91": 9.81216,
+            "92": 9.80689,
+            "93": 9.74583,
+            "94": 9.81719,
+            "95": 9.8138,
+            "96": 9.79844,
+            "97": 9.7308,
+            "98": 9.77083,
+            "99": 9.81473,
+            "100": 9.7054
+        }
+    },
+    "num-zeros": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2222774.0,
+            "52": 2254396.0,
+            "53": 2230278.0,
+            "54": 4237503.0,
+            "55": 2252607.0,
+            "56": 2241594.0,
+            "57": 4229311.0,
+            "58": 2252850.0,
+            "59": 2249962.0,
+            "60": 2239587.0,
+            "61": 4236257.0,
+            "62": 4239445.0,
+            "63": 4248528.0,
+            "64": 4242624.0,
+            "65": 4234629.0,
+            "66": 4237583.0,
+            "67": 4268339.0,
+            "68": 4239062.0,
+            "69": 4237275.0,
+            "70": 4237689.0,
+            "71": 4237172.0,
+            "72": 4237903.0,
+            "73": 2211616.0,
+            "74": 4235627.0,
+            "75": 4237297.0,
+            "76": 2225851.0,
+            "77": 4238129.0,
+            "78": 4239495.0,
+            "79": 2268212.0,
+            "80": 2242352.0,
+            "81": 4237785.0,
+            "82": 253952.0,
+            "83": 2265526.0,
+            "84": 2146594.0,
+            "85": 2212331.0,
+            "86": 2250024.0,
+            "87": 4235496.0,
+            "88": 2236401.0,
+            "89": 2155282.0,
+            "90": 171831.0,
+            "91": 2155804.0,
+            "92": 39235.0,
+            "93": 157576.0,
+            "94": 60717.0,
+            "95": 2140994.0,
+            "96": 2140566.0,
+            "97": 2150987.0,
+            "98": 127906.0,
+            "99": 54570.0,
+            "100": 2142700.0
+        }
+    },
+    "mem-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 787367424.0,
+            "52": 787363328.0,
+            "53": 787363840.0,
+            "54": 787364352.0,
+            "55": 787363328.0,
+            "56": 787361280.0,
+            "57": 787373056.0,
+            "58": 787368960.0,
+            "59": 787363328.0,
+            "60": 787378688.0,
+            "61": 787365376.0,
+            "62": 787369472.0,
+            "63": 787372032.0,
+            "64": 787369984.0,
+            "65": 787375104.0,
+            "66": 787365888.0,
+            "67": 787370496.0,
+            "68": 787376128.0,
+            "69": 787379200.0,
+            "70": 787380736.0,
+            "71": 787378176.0,
+            "72": 787383296.0,
+            "73": 787386880.0,
+            "74": 787383808.0,
+            "75": 787384320.0,
+            "76": 787388928.0,
+            "77": 787406848.0,
+            "78": 787406848.0,
+            "79": 787397120.0,
+            "80": 787408896.0,
+            "81": 787419648.0,
+            "82": 787425280.0,
+            "83": 787414528.0,
+            "84": 787438080.0,
+            "85": 787434496.0,
+            "86": 787440640.0,
+            "87": 787422208.0,
+            "88": 787412992.0,
+            "89": 787431424.0,
+            "90": 787455488.0,
+            "91": 787439104.0,
+            "92": 787461120.0,
+            "93": 787444736.0,
+            "94": 787449344.0,
+            "95": 787457024.0,
+            "96": 787442176.0,
+            "97": 787452416.0,
+            "98": 787451904.0,
+            "99": 787429376.0,
+            "100": 787459584.0
+        }
+    },
+    "mem-max-allocated-bytes": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": 2477707776.0,
+            "52": 2477707776.0,
+            "53": 2477707776.0,
+            "54": 2477707776.0,
+            "55": 2477707776.0,
+            "56": 2477707776.0,
+            "57": 2477707776.0,
+            "58": 2480768000.0,
+            "59": 2480768000.0,
+            "60": 2482860544.0,
+            "61": 2482860544.0,
+            "62": 2482860544.0,
+            "63": 2482860544.0,
+            "64": 2482860544.0,
+            "65": 2482860544.0,
+            "66": 2482860544.0,
+            "67": 2482860544.0,
+            "68": 2482860544.0,
+            "69": 2499616256.0,
+            "70": 2499616256.0,
+            "71": 2499616256.0,
+            "72": 2499616256.0,
+            "73": 2499616256.0,
+            "74": 2499616256.0,
+            "75": 2506918912.0,
+            "76": 2506918912.0,
+            "77": 2506918912.0,
+            "78": 2511425024.0,
+            "79": 2511425024.0,
+            "80": 2517168640.0,
+            "81": 2519074304.0,
+            "82": 2521834496.0,
+            "83": 2521834496.0,
+            "84": 2532288000.0,
+            "85": 2536407040.0,
+            "86": 2539149824.0,
+            "87": 2539149824.0,
+            "88": 2539486720.0,
+            "89": 2539486720.0,
+            "90": 2545446400.0,
+            "91": 2545446400.0,
+            "92": 2550150144.0,
+            "93": 2550150144.0,
+            "94": 2550150144.0,
+            "95": 2550150144.0,
+            "96": 2550150144.0,
+            "97": 2550150144.0,
+            "98": 2550150144.0,
+            "99": 2550150144.0,
+            "100": 2551760384.0
+        }
+    },
+    "iteration-time": {
+        "start_step": 1,
+        "end_step": 100,
+        "step_interval": 1,
+        "values": {
+            "1": "nan",
+            "2": "nan",
+            "3": "nan",
+            "4": "nan",
+            "5": "nan",
+            "6": "nan",
+            "7": "nan",
+            "8": "nan",
+            "9": "nan",
+            "10": "nan",
+            "11": "nan",
+            "12": "nan",
+            "13": "nan",
+            "14": "nan",
+            "15": "nan",
+            "16": "nan",
+            "17": "nan",
+            "18": "nan",
+            "19": "nan",
+            "20": "nan",
+            "21": "nan",
+            "22": "nan",
+            "23": "nan",
+            "24": "nan",
+            "25": "nan",
+            "26": "nan",
+            "27": "nan",
+            "28": "nan",
+            "29": "nan",
+            "30": "nan",
+            "31": "nan",
+            "32": "nan",
+            "33": "nan",
+            "34": "nan",
+            "35": "nan",
+            "36": "nan",
+            "37": "nan",
+            "38": "nan",
+            "39": "nan",
+            "40": "nan",
+            "41": "nan",
+            "42": "nan",
+            "43": "nan",
+            "44": "nan",
+            "45": "nan",
+            "46": "nan",
+            "47": "nan",
+            "48": "nan",
+            "49": "nan",
+            "50": "nan",
+            "51": "nan",
+            "52": 5.98438,
+            "53": 0.53159,
+            "54": 0.51051,
+            "55": 0.50466,
+            "56": 0.49593,
+            "57": 0.50367,
+            "58": 0.48608,
+            "59": 0.48722,
+            "60": 0.48968,
+            "61": 0.49171,
+            "62": 0.49333,
+            "63": 0.49418,
+            "64": 0.49287,
+            "65": 0.48898,
+            "66": 0.48714,
+            "67": 0.48775,
+            "68": 0.49172,
+            "69": 0.48857,
+            "70": 0.49344,
+            "71": 0.49161,
+            "72": 0.49077,
+            "73": 0.48402,
+            "74": 0.49046,
+            "75": 0.48655,
+            "76": 0.48757,
+            "77": 0.48354,
+            "78": 0.47967,
+            "79": 0.48495,
+            "80": 0.4884,
+            "81": 0.49401,
+            "82": 0.48801,
+            "83": 0.4914,
+            "84": 0.48774,
+            "85": 0.48532,
+            "86": 0.49525,
+            "87": 0.49418,
+            "88": 0.483,
+            "89": 0.49391,
+            "90": 0.48927,
+            "91": 0.48905,
+            "92": 0.48991,
+            "93": 0.48937,
+            "94": 0.49072,
+            "95": 0.48777,
+            "96": 0.48656,
+            "97": 0.4864,
+            "98": 0.48473,
+            "99": 0.48278,
+            "100": 0.48492
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json
index 2cf923b859f..a9b971bcaa0 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_fine_grained_offloading/golden_values_dev_dgx_h100.json
@@ -341,4 +341,4 @@
             "50": 1.88268
         }
     }
-}
\ No newline at end of file
+}
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json
index 7a7c3bdeab7..926a9c99765 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading/golden_values_dev_dgx_h100.json
@@ -284,4 +284,4 @@
             "50": 1.94595
         }
     }
-}
\ No newline at end of file
+}
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json
index b71266ec7bf..ae215b3314a 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200.json
@@ -218,106 +218,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 793281024.0,
-            "2": 821341184.0,
-            "3": 816778240.0,
-            "4": 803063808.0,
-            "5": 803063808.0,
-            "6": 803063808.0,
-            "7": 807159808.0,
-            "8": 803063808.0,
-            "9": 803063808.0,
-            "10": 805963776.0,
-            "11": 808945664.0,
-            "12": 803063808.0,
-            "13": 803063808.0,
-            "14": 809617408.0,
-            "15": 809576448.0,
-            "16": 803063808.0,
-            "17": 803063808.0,
-            "18": 803063808.0,
-            "19": 803063808.0,
-            "20": 803063808.0,
-            "21": 806301696.0,
-            "22": 809705472.0,
-            "23": 809128960.0,
-            "24": 803063808.0,
-            "25": 803063808.0,
-            "26": 803063808.0,
-            "27": 805701632.0,
-            "28": 806543360.0,
-            "29": 803063808.0,
-            "30": 803063808.0,
-            "31": 803063808.0,
-            "32": 803063808.0,
-            "33": 805914624.0,
-            "34": 808945664.0,
-            "35": 803063808.0,
-            "36": 803063808.0,
-            "37": 803063808.0,
-            "38": 803063808.0,
-            "39": 805909504.0,
-            "40": 805701632.0,
-            "41": 809805824.0,
-            "42": 803063808.0,
-            "43": 803063808.0,
-            "44": 803063808.0,
-            "45": 803063808.0,
-            "46": 809083904.0,
-            "47": 803063808.0,
-            "48": 803063808.0,
-            "49": 803063808.0,
-            "50": 803063808.0,
-            "51": 803063808.0,
-            "52": 803063808.0,
-            "53": 803063808.0,
-            "54": 806274048.0,
-            "55": 805701632.0,
-            "56": 809486336.0,
-            "57": 808945664.0,
-            "58": 805718016.0,
-            "59": 803063808.0,
-            "60": 803063808.0,
-            "61": 803063808.0,
-            "62": 803063808.0,
-            "63": 805701632.0,
-            "64": 808945664.0,
-            "65": 809230336.0,
-            "66": 803063808.0,
-            "67": 805701632.0,
-            "68": 803063808.0,
-            "69": 805701632.0,
-            "70": 809149440.0,
-            "71": 803063808.0,
-            "72": 809519104.0,
-            "73": 803063808.0,
-            "74": 805701632.0,
-            "75": 803063808.0,
-            "76": 805701632.0,
-            "77": 803063808.0,
-            "78": 803063808.0,
-            "79": 808945664.0,
-            "80": 803063808.0,
-            "81": 805701632.0,
-            "82": 809391104.0,
-            "83": 803063808.0,
-            "84": 805954560.0,
-            "85": 803063808.0,
-            "86": 803063808.0,
-            "87": 806307840.0,
-            "88": 803063808.0,
-            "89": 805701632.0,
-            "90": 808945664.0,
-            "91": 803063808.0,
-            "92": 809469952.0,
-            "93": 803063808.0,
-            "94": 806692864.0,
-            "95": 803063808.0,
-            "96": 803063808.0,
-            "97": 809764864.0,
-            "98": 803063808.0,
-            "99": 805701632.0,
-            "100": 803063808.0
+            "1": 804504064.0,
+            "2": 810118144.0,
+            "3": 769756160.0,
+            "4": 800618496.0,
+            "5": 803915776.0,
+            "6": 807639040.0,
+            "7": 803793920.0,
+            "8": 800618496.0,
+            "9": 803862528.0,
+            "10": 804157440.0,
+            "11": 806500352.0,
+            "12": 800618496.0,
+            "13": 803862528.0,
+            "14": 800618496.0,
+            "15": 800618496.0,
+            "16": 803770368.0,
+            "17": 800618496.0,
+            "18": 803256320.0,
+            "19": 807868416.0,
+            "20": 800618496.0,
+            "21": 803862528.0,
+            "22": 800618496.0,
+            "23": 800618496.0,
+            "24": 803849216.0,
+            "25": 807680000.0,
+            "26": 800618496.0,
+            "27": 800618496.0,
+            "28": 803793920.0,
+            "29": 804400128.0,
+            "30": 800618496.0,
+            "31": 800618496.0,
+            "32": 803862528.0,
+            "33": 800618496.0,
+            "34": 800618496.0,
+            "35": 803633152.0,
+            "36": 806551552.0,
+            "37": 800618496.0,
+            "38": 803984384.0,
+            "39": 800618496.0,
+            "40": 800618496.0,
+            "41": 803256320.0,
+            "42": 800618496.0,
+            "43": 800618496.0,
+            "44": 806500352.0,
+            "45": 800618496.0,
+            "46": 803256320.0,
+            "47": 800618496.0,
+            "48": 800618496.0,
+            "49": 803862528.0,
+            "50": 800618496.0,
+            "51": 803862528.0,
+            "52": 806890496.0,
+            "53": 800618496.0,
+            "54": 803256320.0,
+            "55": 803256320.0,
+            "56": 803256320.0,
+            "57": 800618496.0,
+            "58": 800618496.0,
+            "59": 803429376.0,
+            "60": 807037952.0,
+            "61": 800618496.0,
+            "62": 800618496.0,
+            "63": 803256320.0,
+            "64": 800618496.0,
+            "65": 800618496.0,
+            "66": 800618496.0,
+            "67": 803256320.0,
+            "68": 807037952.0,
+            "69": 800618496.0,
+            "70": 800618496.0,
+            "71": 800618496.0,
+            "72": 800618496.0,
+            "73": 803862528.0,
+            "74": 806500352.0,
+            "75": 806500352.0,
+            "76": 800618496.0,
+            "77": 800618496.0,
+            "78": 804001792.0,
+            "79": 807037952.0,
+            "80": 800618496.0,
+            "81": 800618496.0,
+            "82": 800618496.0,
+            "83": 800618496.0,
+            "84": 800618496.0,
+            "85": 800618496.0,
+            "86": 800618496.0,
+            "87": 803862528.0,
+            "88": 800618496.0,
+            "89": 800618496.0,
+            "90": 804154368.0,
+            "91": 807037952.0,
+            "92": 800618496.0,
+            "93": 803256320.0,
+            "94": 806500352.0,
+            "95": 800618496.0,
+            "96": 800618496.0,
+            "97": 806500352.0,
+            "98": 800618496.0,
+            "99": 800618496.0,
+            "100": 800618496.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 953233408.0,
-            "2": 1168079360.0,
-            "3": 1171789824.0,
-            "4": 1171789824.0,
-            "5": 1171789824.0,
-            "6": 1171789824.0,
-            "7": 1171789824.0,
-            "8": 1171789824.0,
-            "9": 1171789824.0,
-            "10": 1171789824.0,
-            "11": 1171789824.0,
-            "12": 1171789824.0,
-            "13": 1171789824.0,
-            "14": 1171789824.0,
-            "15": 1171789824.0,
-            "16": 1171789824.0,
-            "17": 1171789824.0,
-            "18": 1171789824.0,
-            "19": 1171789824.0,
-            "20": 1171789824.0,
-            "21": 1171789824.0,
-            "22": 1171789824.0,
-            "23": 1171789824.0,
-            "24": 1171789824.0,
-            "25": 1171789824.0,
-            "26": 1171789824.0,
-            "27": 1171789824.0,
-            "28": 1171789824.0,
-            "29": 1171789824.0,
-            "30": 1171789824.0,
-            "31": 1171789824.0,
-            "32": 1171789824.0,
-            "33": 1171789824.0,
-            "34": 1171789824.0,
-            "35": 1171789824.0,
-            "36": 1171789824.0,
-            "37": 1171789824.0,
-            "38": 1171789824.0,
-            "39": 1171789824.0,
-            "40": 1171789824.0,
-            "41": 1171789824.0,
-            "42": 1171789824.0,
-            "43": 1171789824.0,
-            "44": 1171789824.0,
-            "45": 1171789824.0,
-            "46": 1171789824.0,
-            "47": 1171789824.0,
-            "48": 1171789824.0,
-            "49": 1171789824.0,
-            "50": 1171789824.0,
-            "51": 1171789824.0,
-            "52": 1171789824.0,
-            "53": 1171789824.0,
-            "54": 1171789824.0,
-            "55": 1171789824.0,
-            "56": 1171789824.0,
-            "57": 1171789824.0,
-            "58": 1171789824.0,
-            "59": 1171789824.0,
-            "60": 1171789824.0,
-            "61": 1171789824.0,
-            "62": 1171789824.0,
-            "63": 1171789824.0,
-            "64": 1171789824.0,
-            "65": 1171789824.0,
-            "66": 1171789824.0,
-            "67": 1171789824.0,
-            "68": 1171789824.0,
-            "69": 1171789824.0,
-            "70": 1171789824.0,
-            "71": 1171789824.0,
-            "72": 1171789824.0,
-            "73": 1171789824.0,
-            "74": 1171789824.0,
-            "75": 1171789824.0,
-            "76": 1171789824.0,
-            "77": 1171789824.0,
-            "78": 1171789824.0,
-            "79": 1171789824.0,
-            "80": 1171789824.0,
-            "81": 1171789824.0,
-            "82": 1171789824.0,
-            "83": 1171789824.0,
-            "84": 1171789824.0,
-            "85": 1171789824.0,
-            "86": 1171789824.0,
-            "87": 1171789824.0,
-            "88": 1171789824.0,
-            "89": 1171789824.0,
-            "90": 1171789824.0,
-            "91": 1171789824.0,
-            "92": 1171789824.0,
-            "93": 1171789824.0,
-            "94": 1171789824.0,
-            "95": 1171789824.0,
-            "96": 1171789824.0,
-            "97": 1171789824.0,
-            "98": 1171789824.0,
-            "99": 1171789824.0,
-            "100": 1171789824.0
+            "1": 945360384.0,
+            "2": 1168858624.0,
+            "3": 1171210240.0,
+            "4": 1171210240.0,
+            "5": 1171210240.0,
+            "6": 1171210240.0,
+            "7": 1171210240.0,
+            "8": 1171210240.0,
+            "9": 1171210240.0,
+            "10": 1171210240.0,
+            "11": 1171210240.0,
+            "12": 1171210240.0,
+            "13": 1171210240.0,
+            "14": 1171210240.0,
+            "15": 1171210240.0,
+            "16": 1171210240.0,
+            "17": 1171210240.0,
+            "18": 1171210240.0,
+            "19": 1171210240.0,
+            "20": 1171210240.0,
+            "21": 1171210240.0,
+            "22": 1171210240.0,
+            "23": 1171210240.0,
+            "24": 1171210240.0,
+            "25": 1171210240.0,
+            "26": 1171210240.0,
+            "27": 1171210240.0,
+            "28": 1171210240.0,
+            "29": 1171210240.0,
+            "30": 1171210240.0,
+            "31": 1171210240.0,
+            "32": 1171210240.0,
+            "33": 1171210240.0,
+            "34": 1171210240.0,
+            "35": 1171210240.0,
+            "36": 1171210240.0,
+            "37": 1171210240.0,
+            "38": 1171210240.0,
+            "39": 1171210240.0,
+            "40": 1171210240.0,
+            "41": 1171210240.0,
+            "42": 1171210240.0,
+            "43": 1171210240.0,
+            "44": 1171210240.0,
+            "45": 1171210240.0,
+            "46": 1171210240.0,
+            "47": 1171210240.0,
+            "48": 1171210240.0,
+            "49": 1171210240.0,
+            "50": 1171210240.0,
+            "51": 1171210240.0,
+            "52": 1171210240.0,
+            "53": 1171210240.0,
+            "54": 1171210240.0,
+            "55": 1171210240.0,
+            "56": 1171210240.0,
+            "57": 1171210240.0,
+            "58": 1171210240.0,
+            "59": 1171210240.0,
+            "60": 1171210240.0,
+            "61": 1171210240.0,
+            "62": 1171210240.0,
+            "63": 1171210240.0,
+            "64": 1171210240.0,
+            "65": 1171210240.0,
+            "66": 1171210240.0,
+            "67": 1171210240.0,
+            "68": 1171210240.0,
+            "69": 1171210240.0,
+            "70": 1171210240.0,
+            "71": 1171210240.0,
+            "72": 1171210240.0,
+            "73": 1171210240.0,
+            "74": 1171210240.0,
+            "75": 1171210240.0,
+            "76": 1171210240.0,
+            "77": 1171210240.0,
+            "78": 1171210240.0,
+            "79": 1171210240.0,
+            "80": 1171210240.0,
+            "81": 1171210240.0,
+            "82": 1171210240.0,
+            "83": 1171210240.0,
+            "84": 1171210240.0,
+            "85": 1171210240.0,
+            "86": 1171210240.0,
+            "87": 1171210240.0,
+            "88": 1171210240.0,
+            "89": 1171210240.0,
+            "90": 1171210240.0,
+            "91": 1171210240.0,
+            "92": 1171210240.0,
+            "93": 1171210240.0,
+            "94": 1171210240.0,
+            "95": 1171210240.0,
+            "96": 1171210240.0,
+            "97": 1171210240.0,
+            "98": 1171210240.0,
+            "99": 1171210240.0,
+            "100": 1171210240.0
         }
     },
     "mtp_1 loss": {
@@ -540,105 +540,105 @@
         "step_interval": 1,
         "values": {
             "1": "nan",
-            "2": 22.16572,
-            "3": 2.30504,
-            "4": 5.77447,
-            "5": 1.26048,
-            "6": 1.23877,
-            "7": 1.23731,
-            "8": 1.23013,
-            "9": 1.23957,
-            "10": 1.22497,
-            "11": 1.23106,
-            "12": 1.23134,
-            "13": 1.23358,
-            "14": 1.2428,
-            "15": 1.20904,
-            "16": 1.2076,
-            "17": 1.21721,
-            "18": 1.20791,
-            "19": 1.21512,
-            "20": 1.21794,
-            "21": 1.22641,
-            "22": 1.2177,
-            "23": 1.20978,
-            "24": 1.21639,
-            "25": 1.21895,
-            "26": 1.22003,
-            "27": 1.22059,
-            "28": 1.22036,
-            "29": 1.25052,
-            "30": 1.20676,
-            "31": 1.20496,
-            "32": 1.21515,
-            "33": 1.21874,
-            "34": 1.23076,
-            "35": 1.21931,
-            "36": 1.21764,
-            "37": 1.21674,
-            "38": 1.21767,
-            "39": 1.21586,
-            "40": 1.21111,
-            "41": 1.21523,
-            "42": 1.22066,
-            "43": 1.21904,
-            "44": 1.22086,
-            "45": 1.22269,
-            "46": 1.2322,
-            "47": 1.22039,
-            "48": 1.21113,
-            "49": 1.21706,
-            "50": 1.21703,
-            "51": 1.30383,
-            "52": 1.29128,
-            "53": 1.23455,
-            "54": 1.2112,
-            "55": 1.21663,
-            "56": 1.21638,
-            "57": 1.21461,
-            "58": 1.26721,
-            "59": 1.24337,
-            "60": 1.21314,
-            "61": 1.21899,
-            "62": 1.22185,
-            "63": 1.21986,
-            "64": 1.21819,
-            "65": 1.21905,
-            "66": 1.22392,
-            "67": 1.24026,
-            "68": 1.2538,
-            "69": 1.25153,
-            "70": 1.24541,
-            "71": 1.24874,
-            "72": 1.24253,
-            "73": 1.24483,
-            "74": 1.24404,
-            "75": 1.25519,
-            "76": 1.24466,
-            "77": 1.24356,
-            "78": 1.24303,
-            "79": 1.24055,
-            "80": 1.23989,
-            "81": 1.23949,
-            "82": 1.23834,
-            "83": 1.23369,
-            "84": 1.24091,
-            "85": 1.24654,
-            "86": 1.23966,
-            "87": 1.23859,
-            "88": 1.24545,
-            "89": 1.24179,
-            "90": 1.24094,
-            "91": 1.2434,
-            "92": 1.24862,
-            "93": 1.24606,
-            "94": 1.24163,
-            "95": 1.24411,
-            "96": 1.24133,
-            "97": 1.24141,
-            "98": 1.2405,
-            "99": 1.24618,
-            "100": 1.2418
+            "2": 37.9854,
+            "3": 2.28734,
+            "4": 5.35708,
+            "5": 1.26954,
+            "6": 1.23701,
+            "7": 1.26847,
+            "8": 1.25887,
+            "9": 1.29096,
+            "10": 1.29547,
+            "11": 1.23851,
+            "12": 1.24484,
+            "13": 1.25435,
+            "14": 1.24699,
+            "15": 1.23703,
+            "16": 1.23908,
+            "17": 1.25246,
+            "18": 1.25344,
+            "19": 1.24098,
+            "20": 1.24674,
+            "21": 1.25568,
+            "22": 1.25153,
+            "23": 1.23949,
+            "24": 1.23356,
+            "25": 1.23457,
+            "26": 1.23399,
+            "27": 1.22253,
+            "28": 1.2421,
+            "29": 1.24505,
+            "30": 1.23731,
+            "31": 1.23779,
+            "32": 1.23651,
+            "33": 1.24334,
+            "34": 1.23672,
+            "35": 1.23579,
+            "36": 1.23856,
+            "37": 1.2368,
+            "38": 1.27227,
+            "39": 1.23626,
+            "40": 1.23108,
+            "41": 1.23937,
+            "42": 1.25102,
+            "43": 1.26268,
+            "44": 1.24696,
+            "45": 1.23983,
+            "46": 1.26043,
+            "47": 1.26387,
+            "48": 1.24094,
+            "49": 1.23779,
+            "50": 1.2443,
+            "51": 1.34841,
+            "52": 1.22881,
+            "53": 1.23572,
+            "54": 1.25464,
+            "55": 1.20016,
+            "56": 1.21853,
+            "57": 1.2627,
+            "58": 1.22524,
+            "59": 1.24359,
+            "60": 1.23647,
+            "61": 1.23076,
+            "62": 1.2306,
+            "63": 1.24009,
+            "64": 1.24563,
+            "65": 1.24999,
+            "66": 1.22054,
+            "67": 1.22448,
+            "68": 1.24504,
+            "69": 1.23668,
+            "70": 1.23376,
+            "71": 1.27933,
+            "72": 1.23658,
+            "73": 1.24651,
+            "74": 1.21592,
+            "75": 1.21953,
+            "76": 1.24411,
+            "77": 1.23279,
+            "78": 1.22593,
+            "79": 1.23226,
+            "80": 1.22822,
+            "81": 1.22087,
+            "82": 1.22161,
+            "83": 1.23068,
+            "84": 1.25262,
+            "85": 1.24203,
+            "86": 1.23057,
+            "87": 1.2358,
+            "88": 1.23979,
+            "89": 1.24089,
+            "90": 1.24457,
+            "91": 1.23347,
+            "92": 1.24498,
+            "93": 1.24886,
+            "94": 1.24372,
+            "95": 1.23659,
+            "96": 1.24767,
+            "97": 1.25003,
+            "98": 1.24688,
+            "99": 1.33174,
+            "100": 1.38391
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200_2nd.json
index 571ec9d4f22..58d969d35d7 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200_2nd.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_gb200_2nd.json
@@ -268,56 +268,56 @@
             "48": "nan",
             "49": "nan",
             "50": "nan",
-            "51": 804615680.0,
-            "52": 799006720.0,
-            "53": 791773184.0,
-            "54": 803888128.0,
-            "55": 803888128.0,
+            "51": 815838720.0,
+            "52": 811278336.0,
+            "53": 768819200.0,
+            "54": 801250304.0,
+            "55": 801250304.0,
             "56": 801250304.0,
-            "57": 807345152.0,
+            "57": 804433920.0,
             "58": 801250304.0,
-            "59": 803888128.0,
-            "60": 801250304.0,
+            "59": 807432192.0,
+            "60": 803888128.0,
             "61": 801250304.0,
-            "62": 801250304.0,
+            "62": 804494336.0,
             "63": 801250304.0,
-            "64": 804319232.0,
-            "65": 804494336.0,
+            "64": 801250304.0,
+            "65": 801250304.0,
             "66": 801250304.0,
             "67": 807934976.0,
             "68": 801250304.0,
-            "69": 801250304.0,
-            "70": 807132160.0,
+            "69": 807132160.0,
+            "70": 803888128.0,
             "71": 801250304.0,
-            "72": 801250304.0,
-            "73": 804445184.0,
-            "74": 801250304.0,
-            "75": 807132160.0,
-            "76": 801250304.0,
+            "72": 803888128.0,
+            "73": 801250304.0,
+            "74": 807132160.0,
+            "75": 801250304.0,
+            "76": 804576256.0,
             "77": 801250304.0,
             "78": 807132160.0,
             "79": 801250304.0,
-            "80": 801250304.0,
-            "81": 803888128.0,
+            "80": 807132160.0,
+            "81": 801250304.0,
             "82": 801250304.0,
-            "83": 808131584.0,
+            "83": 803904512.0,
             "84": 801250304.0,
-            "85": 801250304.0,
-            "86": 803918848.0,
+            "85": 803888128.0,
+            "86": 801250304.0,
             "87": 801250304.0,
             "88": 807259136.0,
-            "89": 801250304.0,
+            "89": 804887552.0,
             "90": 801250304.0,
-            "91": 803888128.0,
+            "91": 808286208.0,
             "92": 801250304.0,
-            "93": 804412416.0,
+            "93": 804426752.0,
             "94": 801250304.0,
             "95": 801250304.0,
-            "96": 803888128.0,
-            "97": 804494336.0,
-            "98": 801250304.0,
-            "99": 803986432.0,
-            "100": 801250304.0
+            "96": 807132160.0,
+            "97": 801250304.0,
+            "98": 803888128.0,
+            "99": 801250304.0,
+            "100": 803888128.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -375,56 +375,56 @@
             "48": "nan",
             "49": "nan",
             "50": "nan",
-            "51": 1159496704.0,
-            "52": 1159496704.0,
-            "53": 1160988672.0,
-            "54": 1160988672.0,
-            "55": 1160988672.0,
-            "56": 1160988672.0,
-            "57": 1160988672.0,
-            "58": 1160988672.0,
-            "59": 1160988672.0,
-            "60": 1160988672.0,
-            "61": 1160988672.0,
-            "62": 1160988672.0,
-            "63": 1160988672.0,
-            "64": 1160988672.0,
-            "65": 1160988672.0,
-            "66": 1160988672.0,
-            "67": 1160988672.0,
-            "68": 1160988672.0,
-            "69": 1160988672.0,
-            "70": 1160988672.0,
-            "71": 1160988672.0,
-            "72": 1160988672.0,
-            "73": 1160988672.0,
-            "74": 1160988672.0,
-            "75": 1160988672.0,
-            "76": 1160988672.0,
-            "77": 1160988672.0,
-            "78": 1160988672.0,
-            "79": 1160988672.0,
-            "80": 1160988672.0,
-            "81": 1160988672.0,
-            "82": 1160988672.0,
-            "83": 1160988672.0,
-            "84": 1160988672.0,
-            "85": 1160988672.0,
-            "86": 1160988672.0,
-            "87": 1160988672.0,
-            "88": 1160988672.0,
-            "89": 1160988672.0,
-            "90": 1160988672.0,
-            "91": 1160988672.0,
-            "92": 1160988672.0,
-            "93": 1160988672.0,
-            "94": 1160988672.0,
-            "95": 1160988672.0,
-            "96": 1160988672.0,
-            "97": 1160988672.0,
-            "98": 1160988672.0,
-            "99": 1160988672.0,
-            "100": 1160988672.0
+            "51": 1154381312.0,
+            "52": 1157301248.0,
+            "53": 1157301248.0,
+            "54": 1157301248.0,
+            "55": 1157301248.0,
+            "56": 1157301248.0,
+            "57": 1157301248.0,
+            "58": 1157301248.0,
+            "59": 1157301248.0,
+            "60": 1157301248.0,
+            "61": 1157301248.0,
+            "62": 1157301248.0,
+            "63": 1157301248.0,
+            "64": 1157301248.0,
+            "65": 1157301248.0,
+            "66": 1157301248.0,
+            "67": 1157301248.0,
+            "68": 1157301248.0,
+            "69": 1157301248.0,
+            "70": 1157301248.0,
+            "71": 1157301248.0,
+            "72": 1157301248.0,
+            "73": 1157301248.0,
+            "74": 1157301248.0,
+            "75": 1157301248.0,
+            "76": 1157301248.0,
+            "77": 1157301248.0,
+            "78": 1157301248.0,
+            "79": 1157301248.0,
+            "80": 1157301248.0,
+            "81": 1157301248.0,
+            "82": 1157301248.0,
+            "83": 1157301248.0,
+            "84": 1157301248.0,
+            "85": 1157301248.0,
+            "86": 1157301248.0,
+            "87": 1157301248.0,
+            "88": 1157301248.0,
+            "89": 1157301248.0,
+            "90": 1157301248.0,
+            "91": 1157301248.0,
+            "92": 1157301248.0,
+            "93": 1157301248.0,
+            "94": 1157301248.0,
+            "95": 1157301248.0,
+            "96": 1157301248.0,
+            "97": 1157301248.0,
+            "98": 1157301248.0,
+            "99": 1157301248.0,
+            "100": 1157301248.0
         }
     },
     "mtp_1 loss": {
@@ -590,55 +590,55 @@
             "49": "nan",
             "50": "nan",
             "51": "nan",
-            "52": 21.21617,
-            "53": 2.296,
-            "54": 5.66579,
-            "55": 1.2943,
-            "56": 1.27825,
-            "57": 1.24678,
-            "58": 1.25387,
-            "59": 1.25978,
-            "60": 1.25196,
-            "61": 1.26083,
-            "62": 1.2504,
-            "63": 1.26437,
-            "64": 1.26032,
-            "65": 1.30794,
-            "66": 1.25435,
-            "67": 1.25786,
-            "68": 1.26363,
-            "69": 1.27923,
-            "70": 1.2669,
-            "71": 1.26184,
-            "72": 1.26621,
-            "73": 1.26733,
-            "74": 1.25926,
-            "75": 1.26859,
-            "76": 1.26357,
-            "77": 1.26772,
-            "78": 1.26617,
-            "79": 1.26098,
-            "80": 1.2611,
-            "81": 1.26559,
-            "82": 1.26671,
-            "83": 1.26324,
-            "84": 1.26585,
-            "85": 1.265,
-            "86": 1.26029,
-            "87": 1.26344,
-            "88": 1.25979,
-            "89": 1.26481,
-            "90": 1.25944,
-            "91": 1.25837,
-            "92": 1.25308,
-            "93": 1.26608,
-            "94": 1.26648,
-            "95": 1.26382,
-            "96": 1.26434,
-            "97": 1.23869,
-            "98": 1.25822,
-            "99": 1.3128,
-            "100": 1.26109
+            "52": 25.03804,
+            "53": 2.27039,
+            "54": 5.50328,
+            "55": 1.30296,
+            "56": 1.27392,
+            "57": 1.27044,
+            "58": 1.25885,
+            "59": 1.27988,
+            "60": 1.29031,
+            "61": 1.26003,
+            "62": 1.26642,
+            "63": 1.28317,
+            "64": 1.25326,
+            "65": 1.26451,
+            "66": 1.28795,
+            "67": 1.26476,
+            "68": 1.2681,
+            "69": 1.25901,
+            "70": 1.27317,
+            "71": 1.27037,
+            "72": 1.26026,
+            "73": 1.27251,
+            "74": 1.25882,
+            "75": 1.23849,
+            "76": 1.24757,
+            "77": 1.28422,
+            "78": 1.26169,
+            "79": 1.26126,
+            "80": 1.26248,
+            "81": 1.27668,
+            "82": 1.27,
+            "83": 1.26392,
+            "84": 1.26783,
+            "85": 1.26175,
+            "86": 1.25958,
+            "87": 1.25019,
+            "88": 1.25222,
+            "89": 1.29661,
+            "90": 1.27086,
+            "91": 1.25742,
+            "92": 1.26072,
+            "93": 1.23917,
+            "94": 1.24963,
+            "95": 1.25132,
+            "96": 1.24286,
+            "97": 1.24685,
+            "98": 1.25689,
+            "99": 1.2545,
+            "100": 1.26507
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json
index 1a8739a347a..cc3963c29d9 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100.json
@@ -219,105 +219,105 @@
         "step_interval": 1,
         "values": {
             "1": 815727104.0,
-            "2": 793283584.0,
-            "3": 835202048.0,
-            "4": 807708672.0,
-            "5": 808458240.0,
-            "6": 804466688.0,
-            "7": 800862208.0,
-            "8": 808080384.0,
-            "9": 807653376.0,
-            "10": 800862208.0,
-            "11": 808208384.0,
-            "12": 807350272.0,
-            "13": 800862208.0,
-            "14": 808185856.0,
-            "15": 807186432.0,
-            "16": 800862208.0,
-            "17": 807448576.0,
-            "18": 807286784.0,
-            "19": 800862208.0,
-            "20": 807684096.0,
-            "21": 807890944.0,
-            "22": 800862208.0,
-            "23": 808510464.0,
-            "24": 808387584.0,
-            "25": 804035584.0,
-            "26": 800862208.0,
-            "27": 807825408.0,
-            "28": 806744064.0,
-            "29": 800862208.0,
-            "30": 807996416.0,
-            "31": 807682048.0,
-            "32": 803694592.0,
-            "33": 800862208.0,
-            "34": 807350272.0,
-            "35": 806928384.0,
-            "36": 800862208.0,
-            "37": 807727104.0,
-            "38": 807112704.0,
-            "39": 800862208.0,
-            "40": 807997440.0,
-            "41": 807677952.0,
-            "42": 803500032.0,
-            "43": 800862208.0,
-            "44": 807511040.0,
-            "45": 807274496.0,
-            "46": 800862208.0,
-            "47": 807894016.0,
-            "48": 807567360.0,
-            "49": 803500032.0,
-            "50": 800862208.0,
-            "51": 808185856.0,
-            "52": 804275200.0,
-            "53": 800862208.0,
-            "54": 807925760.0,
-            "55": 807542784.0,
-            "56": 803713024.0,
-            "57": 800862208.0,
-            "58": 807350272.0,
-            "59": 807393280.0,
-            "60": 800862208.0,
-            "61": 807858176.0,
-            "62": 807350272.0,
-            "63": 803500032.0,
-            "64": 800862208.0,
-            "65": 807711744.0,
-            "66": 807874560.0,
-            "67": 803500032.0,
-            "68": 807350272.0,
-            "69": 807612416.0,
-            "70": 804241408.0,
-            "71": 800862208.0,
-            "72": 807661568.0,
-            "73": 806744064.0,
-            "74": 800862208.0,
-            "75": 807350272.0,
-            "76": 807350272.0,
-            "77": 800862208.0,
-            "78": 800862208.0,
-            "79": 807976960.0,
-            "80": 808120320.0,
-            "81": 804230144.0,
-            "82": 800862208.0,
-            "83": 808173568.0,
-            "84": 807569408.0,
-            "85": 803500032.0,
-            "86": 800862208.0,
-            "87": 807350272.0,
-            "88": 807421952.0,
-            "89": 803729408.0,
-            "90": 800862208.0,
-            "91": 808009728.0,
-            "92": 808318976.0,
-            "93": 806807552.0,
-            "94": 800862208.0,
-            "95": 800862208.0,
-            "96": 808192000.0,
-            "97": 807350272.0,
-            "98": 806744064.0,
-            "99": 800862208.0,
-            "100": 807636992.0
+            "2": 845646848.0,
+            "3": 774048768.0,
+            "4": 807958528.0,
+            "5": 801470464.0,
+            "6": 804525056.0,
+            "7": 808376320.0,
+            "8": 807958528.0,
+            "9": 807958528.0,
+            "10": 809364480.0,
+            "11": 808515584.0,
+            "12": 807958528.0,
+            "13": 808409088.0,
+            "14": 807958528.0,
+            "15": 807958528.0,
+            "16": 807352320.0,
+            "17": 807578624.0,
+            "18": 808315904.0,
+            "19": 809019392.0,
+            "20": 809110528.0,
+            "21": 808971264.0,
+            "22": 807958528.0,
+            "23": 808331264.0,
+            "24": 808644608.0,
+            "25": 808794112.0,
+            "26": 809303040.0,
+            "27": 807958528.0,
+            "28": 808663040.0,
+            "29": 808248320.0,
+            "30": 808678400.0,
+            "31": 808659968.0,
+            "32": 807958528.0,
+            "33": 808320000.0,
+            "34": 807352320.0,
+            "35": 807352320.0,
+            "36": 807602176.0,
+            "37": 808199168.0,
+            "38": 807802880.0,
+            "39": 807634944.0,
+            "40": 807902208.0,
+            "41": 807958528.0,
+            "42": 805165056.0,
+            "43": 807802880.0,
+            "44": 808007680.0,
+            "45": 807802880.0,
+            "46": 807514112.0,
+            "47": 807763968.0,
+            "48": 807569408.0,
+            "49": 808332288.0,
+            "50": 807958528.0,
+            "51": 801470464.0,
+            "52": 801470464.0,
+            "53": 801470464.0,
+            "54": 801470464.0,
+            "55": 804255744.0,
+            "56": 805165056.0,
+            "57": 807958528.0,
+            "58": 808314880.0,
+            "59": 808409088.0,
+            "60": 807958528.0,
+            "61": 808409088.0,
+            "62": 808889344.0,
+            "63": 808204288.0,
+            "64": 801470464.0,
+            "65": 801470464.0,
+            "66": 801470464.0,
+            "67": 801470464.0,
+            "68": 801470464.0,
+            "69": 801470464.0,
+            "70": 801470464.0,
+            "71": 804452352.0,
+            "72": 804255744.0,
+            "73": 805165056.0,
+            "74": 807472128.0,
+            "75": 808038400.0,
+            "76": 808294400.0,
+            "77": 808305664.0,
+            "78": 807997440.0,
+            "79": 808916992.0,
+            "80": 807958528.0,
+            "81": 808547328.0,
+            "82": 808499200.0,
+            "83": 808409088.0,
+            "84": 807958528.0,
+            "85": 807958528.0,
+            "86": 808828928.0,
+            "87": 808116224.0,
+            "88": 808030208.0,
+            "89": 808409088.0,
+            "90": 807958528.0,
+            "91": 801470464.0,
+            "92": 801470464.0,
+            "93": 801470464.0,
+            "94": 801470464.0,
+            "95": 801470464.0,
+            "96": 801470464.0,
+            "97": 801470464.0,
+            "98": 804943872.0,
+            "99": 804108288.0,
+            "100": 804558848.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -325,106 +325,106 @@
         "end_step": 100,
         "step_interval": 1,
         "values": {
-            "1": 964055552.0,
-            "2": 1186166272.0,
-            "3": 1196239360.0,
-            "4": 1196239360.0,
-            "5": 1196239360.0,
-            "6": 1196239360.0,
-            "7": 1196239360.0,
-            "8": 1196239360.0,
-            "9": 1196239360.0,
-            "10": 1196239360.0,
-            "11": 1196239360.0,
-            "12": 1196239360.0,
-            "13": 1196239360.0,
-            "14": 1196239360.0,
-            "15": 1196239360.0,
-            "16": 1196239360.0,
-            "17": 1196239360.0,
-            "18": 1196239360.0,
-            "19": 1196239360.0,
-            "20": 1196239360.0,
-            "21": 1196239360.0,
-            "22": 1196239360.0,
-            "23": 1196239360.0,
-            "24": 1196239360.0,
-            "25": 1196239360.0,
-            "26": 1196239360.0,
-            "27": 1196239360.0,
-            "28": 1196239360.0,
-            "29": 1196239360.0,
-            "30": 1196239360.0,
-            "31": 1196239360.0,
-            "32": 1196239360.0,
-            "33": 1196239360.0,
-            "34": 1196239360.0,
-            "35": 1196239360.0,
-            "36": 1196239360.0,
-            "37": 1196239360.0,
-            "38": 1196239360.0,
-            "39": 1196239360.0,
-            "40": 1196239360.0,
-            "41": 1196239360.0,
-            "42": 1196239360.0,
-            "43": 1196239360.0,
-            "44": 1196239360.0,
-            "45": 1196239360.0,
-            "46": 1196239360.0,
-            "47": 1196239360.0,
-            "48": 1196239360.0,
-            "49": 1196239360.0,
-            "50": 1196239360.0,
-            "51": 1196239360.0,
-            "52": 1196239360.0,
-            "53": 1196239360.0,
-            "54": 1196239360.0,
-            "55": 1196239360.0,
-            "56": 1196239360.0,
-            "57": 1196239360.0,
-            "58": 1196239360.0,
-            "59": 1196239360.0,
-            "60": 1196239360.0,
-            "61": 1196239360.0,
-            "62": 1196239360.0,
-            "63": 1196239360.0,
-            "64": 1196239360.0,
-            "65": 1196239360.0,
-            "66": 1196239360.0,
-            "67": 1196239360.0,
-            "68": 1196239360.0,
-            "69": 1196239360.0,
-            "70": 1196239360.0,
-            "71": 1196239360.0,
-            "72": 1196239360.0,
-            "73": 1196239360.0,
-            "74": 1196239360.0,
-            "75": 1196239360.0,
-            "76": 1196239360.0,
-            "77": 1196239360.0,
-            "78": 1196239360.0,
-            "79": 1196239360.0,
-            "80": 1196239360.0,
-            "81": 1196239360.0,
-            "82": 1196239360.0,
-            "83": 1196239360.0,
-            "84": 1196239360.0,
-            "85": 1196239360.0,
-            "86": 1196239360.0,
-            "87": 1196239360.0,
-            "88": 1196239360.0,
-            "89": 1196239360.0,
-            "90": 1196239360.0,
-            "91": 1196239360.0,
-            "92": 1196239360.0,
-            "93": 1196239360.0,
-            "94": 1196239360.0,
-            "95": 1196239360.0,
-            "96": 1196239360.0,
-            "97": 1196239360.0,
-            "98": 1196239360.0,
-            "99": 1196239360.0,
-            "100": 1196239360.0
+            "1": 966438400.0,
+            "2": 1183200256.0,
+            "3": 1189751296.0,
+            "4": 1189751296.0,
+            "5": 1189751296.0,
+            "6": 1189751296.0,
+            "7": 1189751296.0,
+            "8": 1189751296.0,
+            "9": 1189751296.0,
+            "10": 1189751296.0,
+            "11": 1189751296.0,
+            "12": 1189751296.0,
+            "13": 1189751296.0,
+            "14": 1189751296.0,
+            "15": 1189751296.0,
+            "16": 1189751296.0,
+            "17": 1189751296.0,
+            "18": 1189751296.0,
+            "19": 1189751296.0,
+            "20": 1189751296.0,
+            "21": 1189751296.0,
+            "22": 1189751296.0,
+            "23": 1189751296.0,
+            "24": 1189751296.0,
+            "25": 1189751296.0,
+            "26": 1189751296.0,
+            "27": 1189751296.0,
+            "28": 1189751296.0,
+            "29": 1189751296.0,
+            "30": 1189751296.0,
+            "31": 1189751296.0,
+            "32": 1189751296.0,
+            "33": 1189751296.0,
+            "34": 1189751296.0,
+            "35": 1189751296.0,
+            "36": 1189751296.0,
+            "37": 1189751296.0,
+            "38": 1189751296.0,
+            "39": 1189751296.0,
+            "40": 1189751296.0,
+            "41": 1189751296.0,
+            "42": 1189751296.0,
+            "43": 1189751296.0,
+            "44": 1189751296.0,
+            "45": 1189751296.0,
+            "46": 1189751296.0,
+            "47": 1189751296.0,
+            "48": 1189751296.0,
+            "49": 1189751296.0,
+            "50": 1189751296.0,
+            "51": 1189751296.0,
+            "52": 1189751296.0,
+            "53": 1189751296.0,
+            "54": 1189751296.0,
+            "55": 1189751296.0,
+            "56": 1189751296.0,
+            "57": 1189751296.0,
+            "58": 1189751296.0,
+            "59": 1189751296.0,
+            "60": 1189751296.0,
+            "61": 1189751296.0,
+            "62": 1189751296.0,
+            "63": 1189751296.0,
+            "64": 1189751296.0,
+            "65": 1189751296.0,
+            "66": 1189751296.0,
+            "67": 1189751296.0,
+            "68": 1189751296.0,
+            "69": 1189751296.0,
+            "70": 1189751296.0,
+            "71": 1189751296.0,
+            "72": 1189751296.0,
+            "73": 1189751296.0,
+            "74": 1189751296.0,
+            "75": 1189751296.0,
+            "76": 1189751296.0,
+            "77": 1189751296.0,
+            "78": 1189751296.0,
+            "79": 1189751296.0,
+            "80": 1189751296.0,
+            "81": 1189751296.0,
+            "82": 1189751296.0,
+            "83": 1189751296.0,
+            "84": 1189751296.0,
+            "85": 1189751296.0,
+            "86": 1189751296.0,
+            "87": 1189751296.0,
+            "88": 1189751296.0,
+            "89": 1189751296.0,
+            "90": 1189751296.0,
+            "91": 1189751296.0,
+            "92": 1189751296.0,
+            "93": 1189751296.0,
+            "94": 1189751296.0,
+            "95": 1189751296.0,
+            "96": 1189751296.0,
+            "97": 1189751296.0,
+            "98": 1189751296.0,
+            "99": 1189751296.0,
+            "100": 1189751296.0
         }
     },
     "mtp_1 loss": {
@@ -540,105 +540,105 @@
         "step_interval": 1,
         "values": {
             "1": "nan",
-            "2": 18.24385,
-            "3": 1.24992,
-            "4": 3.44822,
-            "5": 0.67115,
-            "6": 0.67992,
-            "7": 0.67235,
-            "8": 0.67169,
-            "9": 0.67242,
-            "10": 0.66924,
-            "11": 0.67194,
-            "12": 0.66696,
-            "13": 0.66783,
-            "14": 0.66806,
-            "15": 0.66792,
-            "16": 0.66722,
-            "17": 0.66884,
-            "18": 0.66891,
-            "19": 0.67057,
-            "20": 0.67024,
-            "21": 0.67476,
-            "22": 0.6704,
-            "23": 0.66892,
-            "24": 0.67043,
-            "25": 0.67258,
-            "26": 0.67099,
-            "27": 0.67203,
-            "28": 0.67141,
-            "29": 0.67162,
-            "30": 0.67618,
-            "31": 0.67022,
-            "32": 0.68537,
-            "33": 0.67019,
-            "34": 0.66964,
-            "35": 0.67288,
-            "36": 0.66938,
-            "37": 0.67603,
-            "38": 0.66977,
-            "39": 0.67445,
-            "40": 0.67455,
-            "41": 0.6717,
-            "42": 0.67202,
-            "43": 0.67216,
-            "44": 0.67433,
-            "45": 0.67073,
-            "46": 0.6702,
-            "47": 0.67187,
-            "48": 0.67865,
-            "49": 0.67059,
-            "50": 0.67768,
-            "51": 0.7684,
-            "52": 0.67476,
-            "53": 0.67246,
-            "54": 0.67426,
-            "55": 0.67523,
-            "56": 0.67431,
-            "57": 0.67379,
-            "58": 0.67592,
-            "59": 0.67389,
-            "60": 0.67679,
-            "61": 0.67409,
-            "62": 0.67265,
-            "63": 0.67543,
-            "64": 0.67577,
-            "65": 0.6745,
-            "66": 0.67687,
-            "67": 0.67327,
-            "68": 0.67244,
-            "69": 0.67241,
-            "70": 0.67191,
-            "71": 0.67044,
-            "72": 0.67049,
-            "73": 0.67597,
-            "74": 0.67288,
-            "75": 0.67123,
-            "76": 0.67032,
-            "77": 0.66955,
-            "78": 0.68133,
-            "79": 0.67997,
-            "80": 0.68011,
-            "81": 0.68168,
-            "82": 0.68012,
-            "83": 0.68054,
-            "84": 0.67091,
-            "85": 0.67421,
-            "86": 0.67093,
-            "87": 0.68073,
-            "88": 0.67264,
-            "89": 0.67707,
-            "90": 0.6819,
-            "91": 0.67945,
-            "92": 0.6829,
-            "93": 0.68217,
-            "94": 0.68005,
-            "95": 0.68097,
-            "96": 0.68088,
-            "97": 0.68296,
-            "98": 0.68201,
-            "99": 0.67603,
-            "100": 0.67638
+            "2": 26.19469,
+            "3": 1.32772,
+            "4": 3.30644,
+            "5": 0.72164,
+            "6": 0.71871,
+            "7": 0.718,
+            "8": 0.72768,
+            "9": 0.70555,
+            "10": 0.70505,
+            "11": 0.71334,
+            "12": 0.70522,
+            "13": 0.69925,
+            "14": 0.703,
+            "15": 0.69939,
+            "16": 0.76765,
+            "17": 0.70417,
+            "18": 0.70237,
+            "19": 0.71136,
+            "20": 0.70228,
+            "21": 0.72275,
+            "22": 0.70136,
+            "23": 0.702,
+            "24": 0.70072,
+            "25": 0.7023,
+            "26": 0.7039,
+            "27": 0.70004,
+            "28": 0.7007,
+            "29": 0.69956,
+            "30": 0.7023,
+            "31": 0.7007,
+            "32": 0.69906,
+            "33": 0.69854,
+            "34": 0.69865,
+            "35": 0.70116,
+            "36": 0.70353,
+            "37": 0.70079,
+            "38": 0.69987,
+            "39": 0.70046,
+            "40": 0.70343,
+            "41": 0.70005,
+            "42": 0.69826,
+            "43": 0.71404,
+            "44": 0.70571,
+            "45": 0.70214,
+            "46": 0.69997,
+            "47": 0.70441,
+            "48": 0.70629,
+            "49": 0.70468,
+            "50": 0.70394,
+            "51": 0.78105,
+            "52": 0.7043,
+            "53": 0.70598,
+            "54": 0.70646,
+            "55": 0.7085,
+            "56": 0.71297,
+            "57": 0.71289,
+            "58": 0.71121,
+            "59": 0.70804,
+            "60": 0.71034,
+            "61": 0.71023,
+            "62": 0.70859,
+            "63": 0.70742,
+            "64": 0.71163,
+            "65": 0.70886,
+            "66": 0.70745,
+            "67": 0.70563,
+            "68": 0.70919,
+            "69": 0.70593,
+            "70": 0.70451,
+            "71": 0.70923,
+            "72": 0.70573,
+            "73": 0.70729,
+            "74": 0.70519,
+            "75": 0.70927,
+            "76": 0.70712,
+            "77": 0.70607,
+            "78": 0.70576,
+            "79": 0.70821,
+            "80": 0.70711,
+            "81": 0.70637,
+            "82": 0.70783,
+            "83": 0.70864,
+            "84": 0.70915,
+            "85": 0.70532,
+            "86": 0.70411,
+            "87": 0.70141,
+            "88": 0.7022,
+            "89": 0.70315,
+            "90": 0.70524,
+            "91": 0.70492,
+            "92": 0.70408,
+            "93": 0.70707,
+            "94": 0.7111,
+            "95": 0.70593,
+            "96": 0.70731,
+            "97": 0.70753,
+            "98": 0.7058,
+            "99": 0.70601,
+            "100": 0.70733
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100_2nd.json b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100_2nd.json
index 3f15c7a8c18..357c399d4b7 100644
--- a/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100_2nd.json
+++ b/tests/functional_tests/test_cases/moe/gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_scoped_cudagraph/golden_values_dev_dgx_h100_2nd.json
@@ -269,55 +269,55 @@
             "49": "nan",
             "50": "nan",
             "51": 815727104.0,
-            "52": 818400256.0,
-            "53": 810118144.0,
-            "54": 807626752.0,
-            "55": 803862528.0,
-            "56": 800618496.0,
-            "57": 807106560.0,
-            "58": 807219200.0,
-            "59": 807020544.0,
-            "60": 803729408.0,
-            "61": 800618496.0,
-            "62": 808046592.0,
-            "63": 807908352.0,
-            "64": 807020544.0,
-            "65": 800618496.0,
-            "66": 800618496.0,
-            "67": 807971840.0,
-            "68": 807896064.0,
-            "69": 806615040.0,
-            "70": 800618496.0,
-            "71": 807626752.0,
-            "72": 807979008.0,
-            "73": 803950592.0,
-            "74": 807808000.0,
-            "75": 807626752.0,
-            "76": 803256320.0,
-            "77": 800618496.0,
-            "78": 807626752.0,
-            "79": 803256320.0,
-            "80": 800618496.0,
-            "81": 807626752.0,
-            "82": 807398400.0,
-            "83": 800618496.0,
-            "84": 807663616.0,
-            "85": 807626752.0,
-            "86": 803256320.0,
-            "87": 807761920.0,
-            "88": 807758848.0,
-            "89": 803256320.0,
-            "90": 800618496.0,
-            "91": 807663616.0,
-            "92": 804141056.0,
-            "93": 800618496.0,
-            "94": 807626752.0,
-            "95": 807020544.0,
-            "96": 800618496.0,
-            "97": 807871488.0,
-            "98": 807020544.0,
-            "99": 800618496.0,
-            "100": 808312832.0
+            "52": 787672064.0,
+            "53": 845876224.0,
+            "54": 801167360.0,
+            "55": 801167360.0,
+            "56": 801167360.0,
+            "57": 807655424.0,
+            "58": 808161280.0,
+            "59": 808177664.0,
+            "60": 808276992.0,
+            "61": 808161280.0,
+            "62": 807702528.0,
+            "63": 807049216.0,
+            "64": 803805184.0,
+            "65": 803805184.0,
+            "66": 801167360.0,
+            "67": 801167360.0,
+            "68": 808161280.0,
+            "69": 807655424.0,
+            "70": 808161280.0,
+            "71": 808161280.0,
+            "72": 807655424.0,
+            "73": 807766016.0,
+            "74": 804411392.0,
+            "75": 807555072.0,
+            "76": 807876608.0,
+            "77": 807049216.0,
+            "78": 804411392.0,
+            "79": 804723712.0,
+            "80": 804575232.0,
+            "81": 804542464.0,
+            "82": 803805184.0,
+            "83": 801167360.0,
+            "84": 801167360.0,
+            "85": 801167360.0,
+            "86": 801167360.0,
+            "87": 801167360.0,
+            "88": 801167360.0,
+            "89": 801167360.0,
+            "90": 801167360.0,
+            "91": 801167360.0,
+            "92": 807655424.0,
+            "93": 807655424.0,
+            "94": 807752704.0,
+            "95": 807655424.0,
+            "96": 808343552.0,
+            "97": 808326144.0,
+            "98": 807655424.0,
+            "99": 807655424.0,
+            "100": 808161280.0
         }
     },
     "mem-max-allocated-bytes": {
@@ -375,56 +375,56 @@
             "48": "nan",
             "49": "nan",
             "50": "nan",
-            "51": 1184825856.0,
-            "52": 1184825856.0,
-            "53": 1184825856.0,
-            "54": 1184825856.0,
-            "55": 1184825856.0,
-            "56": 1184825856.0,
-            "57": 1184825856.0,
-            "58": 1184825856.0,
-            "59": 1184825856.0,
-            "60": 1184825856.0,
-            "61": 1184825856.0,
-            "62": 1184825856.0,
-            "63": 1184825856.0,
-            "64": 1184825856.0,
-            "65": 1184825856.0,
-            "66": 1184825856.0,
-            "67": 1184825856.0,
-            "68": 1184825856.0,
-            "69": 1184825856.0,
-            "70": 1184825856.0,
-            "71": 1184825856.0,
-            "72": 1184825856.0,
-            "73": 1184825856.0,
-            "74": 1184825856.0,
-            "75": 1184825856.0,
-            "76": 1184825856.0,
-            "77": 1184825856.0,
-            "78": 1184825856.0,
-            "79": 1184825856.0,
-            "80": 1184825856.0,
-            "81": 1184825856.0,
-            "82": 1184825856.0,
-            "83": 1184825856.0,
-            "84": 1184825856.0,
-            "85": 1184825856.0,
-            "86": 1184825856.0,
-            "87": 1184825856.0,
-            "88": 1184825856.0,
-            "89": 1184825856.0,
-            "90": 1184825856.0,
-            "91": 1184825856.0,
-            "92": 1184825856.0,
-            "93": 1184825856.0,
-            "94": 1184825856.0,
-            "95": 1184825856.0,
-            "96": 1184825856.0,
-            "97": 1184825856.0,
-            "98": 1184825856.0,
-            "99": 1184825856.0,
-            "100": 1184825856.0
+            "51": 1183360512.0,
+            "52": 1183360512.0,
+            "53": 1183360512.0,
+            "54": 1183360512.0,
+            "55": 1183360512.0,
+            "56": 1183360512.0,
+            "57": 1183360512.0,
+            "58": 1183360512.0,
+            "59": 1183360512.0,
+            "60": 1183360512.0,
+            "61": 1183360512.0,
+            "62": 1183360512.0,
+            "63": 1183360512.0,
+            "64": 1183360512.0,
+            "65": 1183360512.0,
+            "66": 1183360512.0,
+            "67": 1183360512.0,
+            "68": 1183360512.0,
+            "69": 1183360512.0,
+            "70": 1183360512.0,
+            "71": 1183360512.0,
+            "72": 1183360512.0,
+            "73": 1183360512.0,
+            "74": 1183360512.0,
+            "75": 1183360512.0,
+            "76": 1183360512.0,
+            "77": 1183360512.0,
+            "78": 1183360512.0,
+            "79": 1183360512.0,
+            "80": 1183360512.0,
+            "81": 1183360512.0,
+            "82": 1183360512.0,
+            "83": 1183360512.0,
+            "84": 1183360512.0,
+            "85": 1183360512.0,
+            "86": 1183360512.0,
+            "87": 1183360512.0,
+            "88": 1183360512.0,
+            "89": 1183360512.0,
+            "90": 1183360512.0,
+            "91": 1183360512.0,
+            "92": 1183360512.0,
+            "93": 1183360512.0,
+            "94": 1183360512.0,
+            "95": 1183360512.0,
+            "96": 1183360512.0,
+            "97": 1183360512.0,
+            "98": 1183360512.0,
+            "99": 1183360512.0,
+            "100": 1183360512.0
         }
     },
     "mtp_1 loss": {
@@ -590,55 +590,55 @@
             "49": "nan",
             "50": "nan",
             "51": "nan",
-            "52": 20.08109,
-            "53": 1.44514,
-            "54": 3.53908,
-            "55": 0.6804,
-            "56": 0.67925,
-            "57": 0.67899,
-            "58": 0.68158,
-            "59": 0.67854,
-            "60": 0.67794,
-            "61": 0.68106,
-            "62": 0.68357,
-            "63": 0.67949,
-            "64": 0.68018,
-            "65": 0.68161,
-            "66": 0.68619,
-            "67": 0.67989,
-            "68": 0.67966,
-            "69": 0.6946,
-            "70": 0.67236,
-            "71": 0.67144,
-            "72": 0.66944,
-            "73": 0.67325,
-            "74": 0.66956,
-            "75": 0.66889,
-            "76": 0.66874,
-            "77": 0.67063,
-            "78": 0.67241,
-            "79": 0.67054,
-            "80": 0.6696,
-            "81": 0.67387,
-            "82": 0.67081,
-            "83": 0.67698,
-            "84": 0.66975,
-            "85": 0.67749,
-            "86": 0.66897,
-            "87": 0.66989,
-            "88": 0.672,
-            "89": 0.6687,
-            "90": 0.66866,
-            "91": 0.67163,
-            "92": 0.67312,
-            "93": 0.66877,
-            "94": 0.66989,
-            "95": 0.67069,
-            "96": 0.66771,
-            "97": 0.6697,
-            "98": 0.67028,
-            "99": 0.6707,
-            "100": 0.67357
+            "52": 18.09211,
+            "53": 1.28775,
+            "54": 3.24511,
+            "55": 0.69966,
+            "56": 0.69376,
+            "57": 0.69117,
+            "58": 0.69316,
+            "59": 0.69638,
+            "60": 0.69853,
+            "61": 0.6932,
+            "62": 0.69475,
+            "63": 0.69227,
+            "64": 0.69319,
+            "65": 0.69359,
+            "66": 0.69243,
+            "67": 0.69192,
+            "68": 0.69118,
+            "69": 0.6942,
+            "70": 0.69339,
+            "71": 0.69124,
+            "72": 0.69449,
+            "73": 0.69318,
+            "74": 0.69259,
+            "75": 0.71185,
+            "76": 0.69973,
+            "77": 0.70144,
+            "78": 0.70035,
+            "79": 0.69919,
+            "80": 0.69907,
+            "81": 0.69984,
+            "82": 0.69858,
+            "83": 0.69906,
+            "84": 0.69911,
+            "85": 0.69917,
+            "86": 0.70099,
+            "87": 0.70055,
+            "88": 0.70014,
+            "89": 0.69921,
+            "90": 0.69993,
+            "91": 0.699,
+            "92": 0.69868,
+            "93": 0.69815,
+            "94": 0.69879,
+            "95": 0.69886,
+            "96": 0.69898,
+            "97": 0.69722,
+            "98": 0.69942,
+            "99": 0.70228,
+            "100": 0.69863
         }
     }
 }
\ No newline at end of file
diff --git a/tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp1pp1ep8/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp1pp1ep8/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp2pp2ep4/golden_values_dev_dgx_h100.json b/tests/functional_tests/test_cases/moe2.0/golden_values/dsv3_tp2pp2ep4/golden_values_dev_dgx_h100.json
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/functional_tests/test_cases/moe2.0/model_configs/dsv3_proxy.yaml b/tests/functional_tests/test_cases/moe2.0/model_configs/dsv3_proxy.yaml
new file mode 100644
index 00000000000..70924aed0cc
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe2.0/model_configs/dsv3_proxy.yaml
@@ -0,0 +1,85 @@
+MODEL_ARGS:
+  # Data args
+  --seq-length: 4096
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  # Add transformer base args
+  --num-layers: 16
+  --hidden-size: 1024
+  --normalization: RMSNorm
+  --norm-epsilon: 1e-6
+  --disable-bias-linear: true
+  --max-position-embeddings: 4096
+  --make-vocab-size-divisible-by: 3232
+  --untie-embeddings-and-output-weights: true
+  # Add attention related args
+  --multi-latent-attention: true
+  --num-attention-heads: 32
+  --kv-channels: 128
+  --qk-layernorm: true
+  --position-embedding-type: rope
+  --rotary-base: 10000
+  --q-lora-rank: 1536
+  --kv-lora-rank: 512
+  --qk-head-dim: 128
+  --qk-pos-emb-head-dim: 64
+  --v-head-dim: 128
+  --rotary-scaling-factor: 40
+  --mscale: 1.0
+  --mscale-all-dim: 1.0
+  # Add MLP related args
+  --swiglu: true
+  --ffn-hidden-size: 4096
+  # Add MoE args
+  --num-experts: 32
+  --moe-layer-freq: ([0]*1+[1]*15)
+  --moe-ffn-hidden-size: 1024
+  --moe-shared-expert-intermediate-size: 1024
+  --moe-router-load-balancing-type: seq_aux_loss
+  --moe-router-topk: 4
+  --moe-router-pre-softmax: true
+  --moe-grouped-gemm: true
+  --moe-aux-loss-coeff: 1e-4
+  --moe-router-group-topk: 2
+  --moe-router-num-groups: 4
+  --moe-router-topk-scaling-factor: 2.0
+  --moe-router-score-function: sigmoid
+  --moe-router-enable-expert-bias: true
+  --moe-router-bias-update-rate: 1e-3
+  --moe-router-dtype: fp32
+  # Comment out the following MTP args to disable MTP
+  --mtp-num-layers: 1
+  --mtp-loss-scaling-factor: 0.1
+  # Add regularization args
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --clip-grad: 1.0
+  --weight-decay: 0.1
+  # Add learning rate args
+  --lr-warmup-fraction: .01
+  --lr: 0.00015
+  --min-lr: 1.0e-5
+  --lr-decay-style: cosine
+  --adam-beta1: 0.9
+  --adam-beta2: 0.95
+  # Add validation args
+  --eval-iters: 32
+  --eval-interval: 200
+  # Add initialization args
+  --init-method-std: 0.02
+  # Training args
+  --global-batch-size: 32
+  --train-iters: 50
+  --exit-duration-in-mins: 230
+  --no-check-for-nan-in-loss-and-grad: true
+
+METRICS:
+  - "lm loss"
+  - "num-zeros"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
+  - "mtp_1 loss"
+  - "seq_load_balancing_loss"
diff --git a/tests/functional_tests/test_cases/moe2.0/model_configs/qwen3_proxy.yaml b/tests/functional_tests/test_cases/moe2.0/model_configs/qwen3_proxy.yaml
new file mode 100644
index 00000000000..46e298ec971
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe2.0/model_configs/qwen3_proxy.yaml
@@ -0,0 +1,74 @@
+MODEL_ARGS:
+  # Data args
+  --seq-length: 4096
+  --data-cache-path: ${DATA_CACHE_PATH}
+  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
+  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
+  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
+  --split: 949,50,1
+  # Add transformer base args
+  --num-layers: 16
+  --hidden-size: 1024
+  --normalization: RMSNorm
+  --norm-epsilon: 1e-6
+  --disable-bias-linear: true
+  --max-position-embeddings: 4096
+  --make-vocab-size-divisible-by: 3232
+  --untie-embeddings-and-output-weights: true
+  # Add attention related args
+  --group-query-attention: true
+  --num-query-groups: 4
+  --kv-channels: 128
+  --qk-layernorm: true
+  --position-embedding-type: rope
+  --rotary-percent: 1.0
+  --rotary-base: 1000000
+  # Add MLP related args
+  --swiglu: true
+  --ffn-hidden-size: 4096
+  # Add MoE args
+  --num-experts: 32
+  --moe-layer-freq: ([0]*1+[1]*15)
+  --moe-ffn-hidden-size: 1024
+  --moe-shared-expert-intermediate-size: 1024
+  --moe-router-load-balancing-type: aux_loss
+  --moe-router-topk: 4
+  --moe-router-pre-softmax: true
+  --moe-grouped-gemm: true
+  --moe-aux-loss-coeff: 1e-4
+  --moe-router-group-topk: 2
+  --moe-router-num-groups: 4
+  --moe-router-topk-scaling-factor: 2.0
+  --moe-router-score-function: sigmoid
+  --moe-router-enable-expert-bias: true
+  --moe-router-bias-update-rate: 1e-3
+  --moe-router-dtype: fp32
+  # Add regularization args
+  --attention-dropout: 0.0
+  --hidden-dropout: 0.0
+  --clip-grad: 1.0
+  --weight-decay: 0.1
+  # Add learning rate args
+  --lr-warmup-fraction: .01
+  --lr: 0.00015
+  --min-lr: 1.0e-5
+  --lr-decay-style: cosine
+  --adam-beta1: 0.9
+  --adam-beta2: 0.95
+  # Add validation args
+  --eval-iters: 32
+  --eval-interval: 200
+  # Add initialization args
+  --init-method-std: 0.02
+  # Training args
+  --global-batch-size: 32
+  --train-iters: 50
+  --exit-duration-in-mins: 230
+  --no-check-for-nan-in-loss-and-grad: true
+
+METRICS:
+  - "lm loss"
+  - "num-zeros"
+  - "mem-allocated-bytes"
+  - "mem-max-allocated-bytes"
+  - "load_balancing_loss"
diff --git a/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp1pp1ep8.yaml b/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp1pp1ep8.yaml
new file mode 100644
index 00000000000..305e2847305
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp1pp1ep8.yaml
@@ -0,0 +1,41 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+  NCCL_NVLS_ENABLE: 0
+  PYTHONWARNINGS: ignore
+  NCCL_DEBUG: VERSION
+
+MODEL_ARGS:
+  # Transformer Engine args
+  --transformer-impl: transformer_engine
+  # Distributed args
+  --distributed-timeout-minutes: 60
+  --tensor-model-parallel-size: 1
+  --pipeline-model-parallel-size: 1
+  --expert-model-parallel-size: 8
+  --context-parallel-size: 1
+  --expert-tensor-parallel-size: 1
+  --use-distributed-optimizer: true
+  --overlap-grad-reduce: true
+  --overlap-param-gather: true
+  # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN
+  --attention-backend: unfused # TODO: switch back to fused attention after fix
+  --use-mcore-models: true
+  --sequence-parallel: true
+  --micro-batch-size: 4
+  # MoE training related args
+  --moe-token-dispatcher-type: alltoall
+  --moe-permute-fusion: true
+  --save-interval: 25
+  # Add mixed precision args
+  --bf16: true
+  --exit-interval: 50
+  # kernel fusion related args
+  --no-rope-fusion: true
+  --cross-entropy-loss-fusion: true
+  --cross-entropy-fusion-impl: native
+  # MISC
+  --manual-gc: true
+  --manual-gc-interval: 100
+TEST_TYPE: resume-ckpt
diff --git a/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp2pp2ep4.yaml b/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp2pp2ep4.yaml
new file mode 100644
index 00000000000..b93862aff8c
--- /dev/null
+++ b/tests/functional_tests/test_cases/moe2.0/runtime_configs/tp2pp2ep4.yaml
@@ -0,0 +1,55 @@
+ENV_VARS:
+  CUDA_DEVICE_MAX_CONNECTIONS: 1
+  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
+  PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True
+  NCCL_NVLS_ENABLE: 0
+  PYTHONWARNINGS: ignore
+  NCCL_DEBUG: VERSION
+
+MODEL_ARGS:
+  # Transformer Engine args
+  --transformer-impl: transformer_engine
+  # Distributed args
+  --distributed-timeout-minutes: 60
+  --tensor-model-parallel-size: 2
+  --pipeline-model-parallel-size: 2
+  --num-virtual-stages-per-pipeline-rank: 4
+  --expert-model-parallel-size: 4
+  --context-parallel-size: 1
+  --expert-tensor-parallel-size: 1
+  --use-distributed-optimizer: true
+  --overlap-grad-reduce: true
+  --overlap-param-gather: true
+  # Use unfused attention since MLA with fused attention and deterministic mode leads to NaN
+  --attention-backend: unfused # TODO: switch back to fused attention after fix
+  --use-mcore-models: true
+  --sequence-parallel: true
+  --micro-batch-size: 4
+  # MoE training related args
+  --moe-token-dispatcher-type: alltoall
+  --moe-permute-fusion: true
+  # Add checkpointing args
+  --save: ${CHECKPOINT_SAVE_PATH}
+  --load: ${CHECKPOINT_LOAD_PATH}
+  --save-interval: 25
+  # Add logging args
+  --log-timers-to-tensorboard: true
+  --log-memory-to-tensorboard: true
+  --log-num-zeros-in-grad: true
+  --log-params-norm: true
+  --log-validation-ppl-to-tensorboard: true
+  --log-throughput: true
+  --log-interval: 1
+  --logging-level: 40
+  --tensorboard-dir: ${TENSORBOARD_PATH}
+  # Add mixed precision args
+  --bf16: true
+  --exit-interval: 50
+  # kernel fusion related args
+  --no-rope-fusion: true
+  --cross-entropy-loss-fusion: true
+  --cross-entropy-fusion-impl: native
+  # MISC
+  --manual-gc: true
+  --manual-gc-interval: 100
+TEST_TYPE: resume-ckpt
\ No newline at end of file
diff --git a/tests/test_utils/python_scripts/merge_config.py b/tests/test_utils/python_scripts/merge_config.py
new file mode 100644
index 00000000000..176706038b7
--- /dev/null
+++ b/tests/test_utils/python_scripts/merge_config.py
@@ -0,0 +1,92 @@
+"""
+Merges base_config, runtime_config and model_config into one final config that the CI can launch.
+
+Starting Dec 19th 2025 MCore CI supports a new format of defining tests. We are decoupling the test
+config into a modular system of base_config, model_config and runtime_config. This allows us to
+re-use and parametrize a given model easily with multiple runtime configs, like parallelism settings.
+
+With this DRY principle, we simplify test maintenance and reduce the amount of code duplication.
+
+This refactoring is fully compliant with the original CI system as we merge the three configs into one
+final config that the CI can launch.
+
+Precendence: Base config > Model config > Runtime config.
+
+Usage:
+
+python merge_config.py \
+    --model_config model_config.yaml \
+    --base_config base_config.yaml \
+    --runtime_config runtime_config.yaml \
+    --output_config output_config.yaml
+"""
+
+import logging
+
+import click
+import yaml
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+@click.command()
+@click.option("--model_config", type=str, help="Model config to merge")
+@click.option("--base_config", type=str, help="Base config to merge")
+@click.option("--runtime_config", type=str, help="Run time config to merge")
+@click.option("--output_config", type=str, help="Output config to merge")
+def main(model_config, base_config, runtime_config, output_config):
+
+    with open(model_config, "r") as f:
+        model_config = yaml.safe_load(f)
+    with open(base_config, "r") as f:
+        base_config = yaml.safe_load(f)
+    with open(runtime_config, "r") as f:
+        runtime_config = yaml.safe_load(f)
+
+    config = {}
+
+    # Collect all top-level keys (ENV_VARS, MODEL_ARGS, etc.)
+    all_keys = set(base_config.keys()) | set(model_config.keys()) | set(runtime_config.keys())
+
+    for key in all_keys:
+        base_val = base_config.get(key)
+        model_val = model_config.get(key)
+        runtime_val = runtime_config.get(key)
+
+        # Get first non-None value to check type
+        first_val = base_val or model_val or runtime_val
+
+        if isinstance(first_val, dict):
+            # Merge dicts
+            config[key] = {}
+            for val in [base_val, model_val, runtime_val]:
+                if val:
+                    config[key].update(val)
+        elif isinstance(first_val, list):
+            # Concatenate lists (deduplicate while preserving order)
+            config[key] = []
+            seen = set()
+            for val in [base_val, model_val, runtime_val]:
+                if val:
+                    for item in val:
+                        if item not in seen:
+                            config[key].append(item)
+                            seen.add(item)
+        else:
+            # Scalar value (string, int, bool, etc.) - use last defined
+            if runtime_val is not None:
+                config[key] = runtime_val
+            elif model_val is not None:
+                config[key] = model_val
+            else:
+                config[key] = base_val
+
+    with open(output_config, "w") as f:
+        yaml.dump(config, f)
+
+    logger.info(f"Config merged and saved to {output_config}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_utils/python_scripts/recipe_parser.py b/tests/test_utils/python_scripts/recipe_parser.py
index 394bda30a01..a04340407e3 100644
--- a/tests/test_utils/python_scripts/recipe_parser.py
+++ b/tests/test_utils/python_scripts/recipe_parser.py
@@ -100,11 +100,16 @@ def load_and_flatten(config_path: str) -> List[dotdict]:
 
 def filter_by_test_case(workload_manifests: List[dotdict], test_case: str) -> Optional[dotdict]:
     """Returns a workload with matching name. Raises an error if there no or more than a single workload."""
+    print(len(workload_manifests))
     workload_manifests = list(
         workload_manifest
         for workload_manifest in workload_manifests
         if workload_manifest["spec"]["test_case"] == test_case
     )
+    print(len(workload_manifests))
+
+    for w in workload_manifests:
+        print(w["spec"]["test_case"])
 
     if len(workload_manifests) > 1:
         logger.info("Duplicate test_case found!")
diff --git a/tests/test_utils/recipes/gb200/gpt.yaml b/tests/test_utils/recipes/gb200/gpt.yaml
index 7c23813ae63..e10cce0cc3c 100644
--- a/tests/test_utils/recipes/gb200/gpt.yaml
+++ b/tests/test_utils/recipes/gb200/gpt.yaml
@@ -161,7 +161,7 @@ products:
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1]
     products:
       - environment: [dev]
-        scope: [nightly]
+        scope: [broken]
         platforms: [dgx_gb200]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_resume_torch_decoupled_lr]
     products:
@@ -186,7 +186,7 @@ products:
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_tunable_overlap]
     products:
       - environment: [dev]
-        scope: [nightly]
+        scope: [broken]
         platforms: [dgx_gb200]
   - test_case: [gpt3_mcore_te_tp1_pp4_vp1_uneven_pipeline]
     products:
diff --git a/tests/test_utils/recipes/gb200/moe.yaml b/tests/test_utils/recipes/gb200/moe.yaml
index eafcb874f4c..c08bef5cd16 100644
--- a/tests/test_utils/recipes/gb200/moe.yaml
+++ b/tests/test_utils/recipes/gb200/moe.yaml
@@ -140,12 +140,12 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM]
     products:
       - environment: [dev]
-        scope: [nightly]
+        scope: [nightly-broken]
         platforms: [dgx_gb200]
   - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4]
     products:
       - environment: [dev]
-        scope: [nightly]
+        scope: [nightly-broken]
         platforms: [dgx_gb200]
   - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router]
     products:
@@ -165,7 +165,7 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective]
     products:
       - environment: [dev]
-        scope: [nightly]
+        scope: [nightly-broken]
         platforms: [dgx_gb200]
   - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_optimizer]
     products:
@@ -211,7 +211,7 @@ products:
   - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer]
     products:
       - environment: [dev]
-        scope: [mr]
+        scope: [mr-broken]
         platforms: [dgx_gb200]
   - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed]
     products:
diff --git a/tests/test_utils/recipes/h100/gpt.yaml b/tests/test_utils/recipes/h100/gpt.yaml
index 52e38760f84..5da053b793d 100644
--- a/tests/test_utils/recipes/h100/gpt.yaml
+++ b/tests/test_utils/recipes/h100/gpt.yaml
@@ -313,6 +313,11 @@ products:
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
+  - test_case: [gpt3_mcore_te_tp2_pp1_cp4_dcp]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-github-slim]
+        platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_cp2_etp4_calculate_per_token_loss_nondeterministic_dp_last]
     products:
       - environment: [dev]
@@ -347,11 +352,21 @@ products:
       - environment: [dev]
         scope: [mr, mr-github, mr-github-slim]
         platforms: [dgx_h100]
+  - test_case: [gpt3_mcore_te_tp2_pp2_mhc]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github]
+        platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_mla]
     products:
       - environment: [dev]
         scope: [mr, mr-github]
         platforms: [dgx_h100]
+  - test_case: [gpt3_mcore_te_tp2_pp2_dsa]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-github-slim]
+        platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_resume_torch_dist_ddp_average_in_collective]
     products:
       - environment: [dev]
diff --git a/tests/test_utils/recipes/h100/mamba-static-inference.yaml b/tests/test_utils/recipes/h100/mamba-static-inference.yaml
index b36c4a8f765..ae9692b4edc 100644
--- a/tests/test_utils/recipes/h100/mamba-static-inference.yaml
+++ b/tests/test_utils/recipes/h100/mamba-static-inference.yaml
@@ -60,8 +60,8 @@ products:
       - environment: [dev]
         scope: [mr-broken, mr-github-broken]
         platforms: [dgx_h100]
-  - test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs]
-    products:
-      - environment: [dev]
-        scope: [mr]
-        platforms: [dg  x_h100]
+  # - test_case: [hybrid_static_inference_tp1_pp1_2B_cudagraphs]
+  #   products:
+  #     - environment: [dev]
+  #       scope: [mr]
+  #       platforms: [dgx_h100] # Broken after dev2main sync 01/27
diff --git a/tests/test_utils/recipes/h100/moe.yaml b/tests/test_utils/recipes/h100/moe.yaml
index 6aeee162ab8..f951dfd9da0 100644
--- a/tests/test_utils/recipes/h100/moe.yaml
+++ b/tests/test_utils/recipes/h100/moe.yaml
@@ -149,7 +149,7 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_overlap_grad_reduce_param_gather_groupedGEMM]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr-broken, mr-github-broken]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts_etp1_ep4]
     products:
@@ -161,7 +161,7 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp1_te_a2a_ovlp_8experts_etp1_ep4]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr-broken, mr-github-broken]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_zp_z3_resume_torch_dist_te_8experts2parallel_top2router]
     products:
@@ -171,7 +171,7 @@ products:
   - test_case: [gpt3_mcore_te_tp2_pp1_te_8experts2parallel_ddp_average_in_collective]
     products:
       - environment: [dev]
-        scope: [mr, mr-github]
+        scope: [mr-broken, mr-github-broken]
         platforms: [dgx_h100]
       - environment: [lts]
         scope: [nightly]
@@ -224,7 +224,7 @@ products:
   - test_case: [gpt3_moe_mcore_te_tp4_ep2_etp2_pp2_resume_torch_dist_dist_optimizer]
     products:
       - environment: [dev]
-        scope: [mr, mr-github, mr-github-slim]
+        scope: [mr-broken, mr-github-broken, mr-github-slim-broken]
         platforms: [dgx_h100]
   - test_case: [gpt3_mcore_te_tp2_pp2_ep4_etp1_memory_speed]
     products:
diff --git a/tests/test_utils/recipes/h100/t5.yaml b/tests/test_utils/recipes/h100/t5.yaml
deleted file mode 100644
index 1761cd3f1e6..00000000000
--- a/tests/test_utils/recipes/h100/t5.yaml
+++ /dev/null
@@ -1,113 +0,0 @@
-type: basic
-format_version: 1
-maintainers: [mcore]
-loggers: [stdout]
-spec:
-  name: '{test_case}_{environment}_{platforms}'
-  model: t5
-  build: mcore-pyt-{environment}
-  nodes: 1
-  gpus: 8
-  platforms: dgx_a100
-  script_setup: |
-    unset https_proxy
-    echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc
-
-    # Checkout latest
-    cd /opt
-    rm -rf /opt/megatron-lm; mkdir megatron-lm; cd megatron-lm
-    git init
-    git remote add origin $MCORE_REPO
-    git fetch origin '+refs/merge-requests/*:refs/remotes/merge-requests/*'
-    git fetch origin $MCORE_MR_COMMIT
-    git checkout $MCORE_MR_COMMIT
-    git rev-parse HEAD
-
-    # Checkout backwards-ref
-    cd /opt
-    rm -rf /opt/megatron-lm-legacy; mkdir megatron-lm-legacy; cd megatron-lm-legacy
-    git init
-    git remote add origin $MCORE_REPO
-    git fetch origin $MCORE_BACKWARDS_COMMIT
-    git checkout $MCORE_BACKWARDS_COMMIT
-    git rev-parse HEAD
-    rm -rf megatron; cp -a /opt/megatron-lm/megatron ./
-  script: |-
-    ls
-    cd /opt/megatron-lm
-
-    NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g')
-
-    ARGUMENTS=(
-        "DATA_PATH=/mnt/artifacts"
-        "DATA_CACHE_PATH=/workspace/data/cache"
-        "OUTPUT_PATH={assets_dir}"
-        "TENSORBOARD_PATH={assets_dir}/tensorboard"
-        "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
-        "CHECKPOINT_LOAD_PATH=/mnt/artifacts"
-        "TRAINING_SCRIPT_PATH=pretrain_t5.py"
-        "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
-        "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
-        "N_REPEAT={n_repeat}"
-        "ENABLE_LIGHTWEIGHT_MODE=${{ENABLE_LIGHTWEIGHT_MODE}}"
-        "RECORD_CHECKPOINTS=${{RECORD_CHECKPOINTS}}"
-    )
-
-    bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
-
-products:
-  - test_case: [t5_11b_mcore_tp4_pp1]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_h100]
-  - test_case: [t5_mcore_te_tp4_pp1]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_h100]
-  - test_case: [t5_mcore_te_tp4_pp1_resume_torch_dist]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_h100]
-  - test_case: [t5_mcore_tp4_pp1]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_h100]
-  - test_case: [t5_mcore_tp4_pp1_resume_torch_dist]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_h100]
-  - test_case: [t5_mcore_te_tp1_pp1_vp1_resume_torch]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_a100, dgx_h100]
-  - test_case: [t5_mcore_te_tp2_pp1_vp1]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_a100, dgx_h100]
-  - test_case: [t5_mcore_te_tp2_pp1_vp1_sequence_parallel]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_a100, dgx_h100]
-  - test_case: [t5_mcore_tp1_pp1_vp1]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_a100, dgx_h100]
-  - test_case: [t5_mcore_tp1_pp1_vp1_resume_torch]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_a100, dgx_h100]
-  - test_case: [t5_mcore_tp2_pp1_vp1]
-    products:
-      - environment: [dev]
-        scope: [nightly]
-        platforms: [dgx_a100, dgx_h100]
diff --git a/tests/test_utils/recipes/h100/bert.yaml b/tests/test_utils/recipes/moe2.0.yaml
similarity index 59%
rename from tests/test_utils/recipes/h100/bert.yaml
rename to tests/test_utils/recipes/moe2.0.yaml
index 89499f93c5e..39fccd08c40 100644
--- a/tests/test_utils/recipes/h100/bert.yaml
+++ b/tests/test_utils/recipes/moe2.0.yaml
@@ -3,14 +3,13 @@ format_version: 1
 maintainers: [mcore]
 loggers: [stdout]
 spec:
-  name: '{test_case}_{environment}_{platforms}'
-  model: bert
-  nodes: 1
+  name: "{test_case}_{environment}_{platforms}"
+  model: moe2.0
   build: mcore-pyt-{environment}
+  nodes: 1
   gpus: 8
+  n_repeat: 5
   platforms: dgx_a100
-  time_limit:
-  n_repeat:
   script_setup: |
     unset https_proxy
     echo "machine gitlab-master.nvidia.com login okoenig password $RO_API_TOKEN" | tee -a /root/.netrc
@@ -37,15 +36,24 @@ spec:
   script: |-
     ls
     cd /opt/megatron-lm
+
     NAME=$(echo {test_case}_{environment} | sed 's/dgx_h100/dgx_a100/g')
+
+    mkdir -p $(dirname ./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml)
+    python ./tests/test_utils/python_scripts/merge_config.py \
+      --base_config ./tests/functional_tests/test_cases/ci_base_config.yml \
+      --model_config ./tests/functional_tests/test_cases/{model}/model_configs/{model_config}.yaml \
+      --runtime_config ./tests/functional_tests/test_cases/{model}/runtime_configs/{runtime_config}.yaml \
+      --output_config ./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml
+
     ARGUMENTS=(
         "DATA_PATH=/mnt/artifacts"
-        "DATA_CACHE_PATH=/workspace/data/cache" 
+        "DATA_CACHE_PATH=/workspace/data/cache"
         "OUTPUT_PATH={assets_dir}"
         "TENSORBOARD_PATH={assets_dir}/tensorboard"
         "CHECKPOINT_SAVE_PATH={artifacts_dir}/checkpoints"
         "CHECKPOINT_LOAD_PATH=/mnt/artifacts"
-        "TRAINING_SCRIPT_PATH=pretrain_bert.py"
+        "TRAINING_SCRIPT_PATH=pretrain_gpt.py"
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}_{platforms}.json"
         "N_REPEAT={n_repeat}"
@@ -56,38 +64,49 @@ spec:
     bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
 
 products:
-  - test_case: [bert_mcore_tp2_pp2]
+  ###########################
+  # Merge train tests       #
+  ###########################
+  - test_case: [dsv3_tp1pp1ep8]
     products:
-      - environment: [dev]
-        scope: [nightly]
+      - model_config: dsv3_proxy
+        runtime_config: tp1pp1ep8
+        environment: [dev]
+        scope: [nightly-broken]
         platforms: [dgx_h100]
-  - test_case: [bert_mcore_tp2_pp2_local_spec]
+  - test_case: [dsv3_tp2pp2ep4]
     products:
-      - environment: [dev]
-        scope: [nightly]
+      - model_config: dsv3_proxy
+        runtime_config: tp2pp2ep4
+        environment: [dev]
+        scope: [nightly-broken]
         platforms: [dgx_h100]
-  - test_case: [bert_mcore_tp2_pp2_resume_torch_dist]
+  - test_case: [qwen3_tp1pp1ep1]
     products:
-      - environment: [dev]
-        scope: [nightly]
+      - model_config: qwen3_proxy
+        runtime_config: tp1pp1ep1
+        environment: [dev]
+        scope: [nightly-broken]
         platforms: [dgx_h100]
-  - test_case: [bert_mcore_tp2_pp2_resume_torch_dist_local_spec]
+  - test_case: [qwen3_tp2pp2ep4]
     products:
-      - environment: [dev]
-        scope: [nightly]
+      - model_config: qwen3_proxy
+        runtime_config: tp2pp2ep4
+        environment: [dev]
+        scope: [nightly-broken]
         platforms: [dgx_h100]
   - test_case: [bert_mcore_tp1_pp2]
     products:
       - environment: [dev]
-        scope: [nightly]
+        scope: [nightly-broken]
         platforms: [dgx_h100]
   - test_case: [bert_mcore_tp1_pp4_vp2]
     products:
       - environment: [dev]
-        scope: [nightly]
+        scope: [nightly-broken]
         platforms: [dgx_h100]
   - test_case: [bert_mcore_tp4_pp1]
     products:
       - environment: [dev]
-        scope: [nightly]
+        scope: [nightly-broken]
         platforms: [dgx_h100]
diff --git a/tests/unit_tests/distributed/megatron_fsdp/test_mcore_fully_sharded_data_parallel.py b/tests/unit_tests/distributed/megatron_fsdp/test_mcore_fully_sharded_data_parallel.py
index 04f7c5c6482..500045871e7 100644
--- a/tests/unit_tests/distributed/megatron_fsdp/test_mcore_fully_sharded_data_parallel.py
+++ b/tests/unit_tests/distributed/megatron_fsdp/test_mcore_fully_sharded_data_parallel.py
@@ -736,12 +736,21 @@ def _training_loop(seed=42, **kwargs):
             train_iters=NUM_TRAINING_STEPS,
             **kwargs,
         )
-        if kwargs.get("use_megatron_fsdp", False) and kwargs.get(
+        megatron_fsdp_te_fused_adam = kwargs.get("use_megatron_fsdp", False) and kwargs.get(
             "use_precision_aware_optimizer", False
-        ):
+        )
+        if megatron_fsdp_te_fused_adam:
             assert (
                 not optim.optimizer.master_weights
             ), "Megatron-FSDP should not use FusedAdam master weights."
+            assert (
+                optim.optimizer.use_decoupled_grad
+            ), "Megatron-FSDP should be using a decoupled gradient with FusedAdam."
+            assert model_chunks[
+                0
+            ].module.param_and_grad_buffer.use_decoupled_grad, (
+                "Megatron-FSDP is installing gradients into param.decoupled_grad."
+            )
 
         # Prepare data iterator
         data_iterator = make_gpt_mock_data_iterator(
@@ -764,6 +773,17 @@ def _training_loop(seed=42, **kwargs):
                 micro_batch_size=MICRO_BATCH_SIZE,
                 num_micro_batches=GLOBAL_BATCH_SIZE // MICRO_BATCH_SIZE // DP_GROUP.size(),
             )
+            # Check that at least one non-null / non-zero gradient
+            # exists when using Megatron-FSDP.
+            if kwargs.get("use_megatron_fsdp", False):
+                grad_attr = "decoupled_grad" if megatron_fsdp_te_fused_adam else "grad"
+                assert any(
+                    [
+                        getattr(p, grad_attr, None) is not None
+                        and getattr(p, grad_attr, None)._local_tensor.any()
+                        for p in model_chunks[0].parameters()
+                    ]
+                ), f"[Megatron-FSDP] Missing gradient in Parameter.{grad_attr}..."
             optim.step()
 
             # Collect loss
diff --git a/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
new file mode 100644
index 00000000000..3ac8e7f6200
--- /dev/null
+++ b/tests/unit_tests/fusions/test_fused_linear_cross_entropy.py
@@ -0,0 +1,1509 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+
+import contextlib
+import os
+import typing
+from contextlib import ExitStack
+from dataclasses import dataclass
+
+import numpy as np
+import pytest
+import torch
+import torch.distributed as dist
+from torch.utils.data import DataLoader, Dataset
+from torch.utils.data.distributed import DistributedSampler
+
+import megatron.core.parallel_state as ps
+from megatron.core.fusions.fused_linear_cross_entropy import linear_cross_entropy
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_decoder_block_spec,
+    get_gpt_mtp_block_spec,
+)
+from megatron.core.models.gpt.gpt_model import GPTModel
+from megatron.training.utils import get_device_arch_version
+from tests.unit_tests.a2a_overlap.utils import (
+    deterministic_mode,
+    get_test_config,
+    get_valid_fp8_flags,
+    get_valid_token_dispatcher_types,
+)
+from tests.unit_tests.test_utilities import Utils
+
+
+# 1. Define a standardized context to hold your distributed info
+@dataclass
+class DistContext:
+    rank: int
+    world_size: int
+    group: dist.ProcessGroup
+    is_chief: bool
+
+
+# 2. Create a module-scoped fixture
+# This runs ONE time per file, no matter how many test classes you have.
+@pytest.fixture(scope="module")
+def distributed_context():
+    # --- PRE-CHECK ---
+    if "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2:
+        pytest.skip("Requires torchrun with multiple GPUs (WORLD_SIZE >= 2)")
+
+    # --- SETUP ---
+    is_external_init = dist.is_initialized()
+
+    if not is_external_init:
+        # Initialize only if not already done (e.g., by another test runner)
+        dist.init_process_group(
+            backend="nccl",
+            init_method="env://",
+            world_size=int(os.environ["WORLD_SIZE"]),
+            rank=int(os.environ["RANK"]),
+        )
+
+    # Set device immediately to avoid cross-device pollution
+    local_rank = int(os.environ.get("LOCAL_RANK", os.environ["RANK"]))
+    device = torch.device(f"cuda:{local_rank}")
+    torch.cuda.set_device(device)
+
+    # Gather context data
+    rank = dist.get_rank()
+    world_size = dist.get_world_size()
+    group = dist.group.WORLD
+
+    print(f"[INFO]: Initialized Rank: {rank} / {world_size}")
+
+    context = DistContext(rank=rank, world_size=world_size, group=group, is_chief=(rank == 0))
+
+    # Yield control to the tests
+    yield context
+
+    # --- TEARDOWN ---
+    # Only destroy if we were the ones who initialized it
+    if not is_external_init:
+        dist.destroy_process_group()
+
+
+class MockDataset(Dataset):
+    """
+    Mock dataset for torchtitan GPT training tests
+    Generates synthetic tokenized sequences on-the-fly
+    """
+
+    def __init__(
+        self,
+        num_samples=10000,
+        micro_batch_size=4,
+        sequence_length=2048,
+        vocab_size=128256,
+        seed=42,
+    ):
+        """
+        Initialize mock dataset
+
+        Args:
+            num_samples: Total number of samples
+            sequence_length: Length of each sequence
+            vocab_size: Size of vocabulary
+            seed: Random seed for reproducibility
+        """
+        self.num_samples = num_samples
+        self.micro_batch_size = micro_batch_size
+        self.sequence_length = sequence_length
+        self.vocab_size = vocab_size
+        self.seed = seed
+
+        # Set numpy seed for deterministic generation
+        np.random.seed(seed)
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, idx):
+        """
+        Generate a single training sample
+
+        Returns:
+            dict with 'tokens' and 'labels'
+        """
+        # Use idx as seed for reproducible but varied samples
+        rng = np.random.RandomState(self.seed + idx)
+
+        # Generate random token sequence
+        tokens = rng.randint(0, self.vocab_size, size=self.sequence_length, dtype=np.int64)
+
+        # Labels are tokens shifted by 1 (next token prediction)
+        labels = rng.randint(0, self.vocab_size, size=self.sequence_length, dtype=np.int64)
+
+        return {
+            'input_ids': torch.from_numpy(tokens.copy()),
+            'labels': torch.from_numpy(labels.copy()),
+            "attention_mask": torch.ones(
+                (1, self.sequence_length, self.sequence_length), dtype=bool
+            ),
+        }
+
+
+def build_model(config):
+    max_seq_len = 300
+
+    # build layer spec
+    transformer_layer_spec = get_gpt_decoder_block_spec(config=config, use_transformer_engine=True)
+    mtp_block_spec = get_gpt_mtp_block_spec(config, transformer_layer_spec.layer_specs[-1], True)
+
+    # build model
+    gpt_model = GPTModel(
+        config=config,
+        transformer_layer_spec=transformer_layer_spec,
+        mtp_block_spec=mtp_block_spec,
+        vocab_size=100,
+        pre_process=True,
+        post_process=True,
+        max_sequence_length=max_seq_len,
+    )
+    return gpt_model
+
+
+# Define a reusable context manager
+@contextlib.contextmanager
+def init_model_parallel(tp=1, pp=1, ep=1):
+    try:
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp,
+            pipeline_model_parallel_size=pp,
+            expert_model_parallel_size=ep,
+        )
+        yield
+    finally:
+        Utils.destroy_model_parallel()
+
+
+def init_gpt_dataloader(
+    dp_group, micro_batch_size=1, vocab_size=50257, sequence_length=128, batch_size=8
+):
+    dataset = MockDataset(
+        num_samples=1000,
+        micro_batch_size=micro_batch_size,
+        sequence_length=sequence_length,
+        vocab_size=vocab_size,
+        seed=42,
+    )
+    sampler = DistributedSampler(dataset, num_replicas=dp_group.size(), rank=dp_group.rank())
+    dataloader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
+    return dataloader
+
+
+# skip it for good
+@pytest.mark.skipif(
+    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2) or True,
+    reason="Requires torchrun with multiple GPUs",
+)
+class TestFusedLinearCrossEntropyOnGptModel:
+    @pytest.mark.parametrize("fp8_flag", get_valid_fp8_flags())
+    @pytest.mark.parametrize("mtp_layers", [0, 1])
+    @pytest.mark.parametrize("dispatcher_type", get_valid_token_dispatcher_types())
+    @pytest.mark.parametrize("layer_num", [2])
+    def test_gpt_model(self, mtp_layers, dispatcher_type, fp8_flag, layer_num):
+        with ExitStack() as stack:
+            gpu_count = torch.cuda.device_count()
+            tp = min(2, gpu_count)
+            ep = gpu_count // tp
+            stack.enter_context(init_model_parallel(tp=tp, ep=ep))
+            stack.enter_context(deterministic_mode())
+
+            # create TransformerConfig
+            extra_kwargs = {
+                "moe_token_dispatcher_type": dispatcher_type,
+                "sequence_parallel": tp > 1,
+                "tensor_model_parallel_size": tp,
+            }
+            if dispatcher_type == "flex":
+                extra_kwargs["moe_enable_deepep"] = True
+                extra_kwargs["moe_router_dtype"] = "fp32"
+            if fp8_flag is not None:
+                extra_kwargs["fp8"] = fp8_flag[0]
+                extra_kwargs["fp8_recipe"] = fp8_flag[1]
+            if mtp_layers > 0:
+                extra_kwargs["mtp_num_layers"] = mtp_layers
+                extra_kwargs["mtp_loss_scaling_factor"] = 1.1
+
+            # build config
+            config = get_test_config(num_layers=layer_num, extra_kwargs=extra_kwargs)
+            config.expert_model_parallel_size = ep
+
+            # build model
+            gpt_model = build_model(config)
+            gpt_model.cuda()
+
+            dataloader = init_gpt_dataloader(
+                ps.get_data_parallel_group(),
+                vocab_size=gpt_model.vocab_size,
+                micro_batch_size=1,
+                sequence_length=gpt_model.max_sequence_length,
+                batch_size=4,
+            )
+            # for batch in dataloder:
+            for batch in dataloader:
+                batch["position_ids"] = torch.arange(
+                    gpt_model.max_sequence_length, dtype=torch.int64
+                )
+                batch = {k: v.cuda() for k, v in batch.items()}
+                gpt_model.zero_grad()
+                output = gpt_model(**batch)
+                loss = output.sum()
+                loss.backward()
+
+
+@pytest.mark.skipif(
+    "WORLD_SIZE" in os.environ and os.environ["WORLD_SIZE"] != "1", reason="Requires single GPU"
+)
+@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10")
+class TestFusedLinearCrossEntropyDataParallel:
+    def cleanup(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        import gc
+
+        gc.collect()
+        torch.cuda.synchronize()
+
+    @staticmethod
+    def torch_linear_cross_entropy(
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        reduction: str,
+        ignore_index: int,
+    ):
+        # NOTE: need to convert to fp32 to fp32 accumulation,
+        # thus assure accuracy
+        logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+        logprobs = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.shape[-1]),
+            labels.view(-1),
+            reduction=reduction,
+            ignore_index=ignore_index,
+        )
+        return logprobs.to(torch.float32)
+
+    @staticmethod
+    def get_problems():
+        return [
+            (80, 125, 64),
+            (80, 152064, 64),
+            (1024, 152064, 4096),
+            (4096, 152063, 8192),
+            ((1, 4096), 152064, 8192),
+            ((2, 4096), 152064, 8192),
+        ]
+
+    @staticmethod
+    def get_ignore_index():
+        return [-100, 4]
+
+    def test_kernel_launch(self):
+        """
+        Check if the compiled kernel can be
+        launched with different problem sizes
+        """
+        self.cleanup()
+
+        num_tokens = [15, 26, 128, 513, 2048, 8192]
+        vocab_size = 152064
+        dim = 4096
+        dtype = torch.bfloat16
+        reduction = "mean"
+        ignore_index = -100
+
+        weight = torch.randn(vocab_size, dim, dtype=dtype, device="cuda").requires_grad_()
+        for num_token in num_tokens:
+            hidden = torch.randn(num_token, dim, dtype=dtype, device="cuda").requires_grad_()
+            labels = torch.randint(0, vocab_size, (num_token,), dtype=torch.long, device="cuda")
+
+            logprobs = linear_cross_entropy(
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+            )
+            assert not torch.isnan(logprobs).any()
+
+            gLogprobs = torch.randn_like(logprobs)
+            (d_hidden, d_weight) = torch.autograd.grad(
+                (logprobs,), (hidden, weight), (gLogprobs,), retain_graph=False
+            )
+            assert not torch.isnan(d_hidden).any()
+            assert not torch.isnan(d_weight).any()
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("problem", get_problems())
+    @pytest.mark.parametrize("reduction", ["none", "mean", "sum"])
+    @pytest.mark.parametrize("ignore_index", get_ignore_index())
+    def test_correctness(self, dtype, problem, reduction, ignore_index):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        hidden = (
+            torch.empty(hidden_shape, dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+        if ignore_index >= 0 and ignore_index < vocabsize:
+            pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
+            labels = pad_labels[..., 1:].contiguous()
+
+        # forward
+        torch_logprobs = self.torch_linear_cross_entropy(
+            hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+        )
+
+        custom_logprobs = linear_cross_entropy(
+            hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+        )
+
+        torch.testing.assert_close(torch_logprobs, custom_logprobs)
+
+        # backward
+        g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+
+        (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
+            (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+        )
+
+        (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
+            (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+        )
+
+        torch.testing.assert_close(d_torch_hidden, d_custom_hidden, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(d_torch_weight, d_custom_weight, atol=1e-3, rtol=1e-3)
+
+    @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    @pytest.mark.parametrize("ignore_index", [-100])
+    def test_performance(self, problem, dtype, reduction, ignore_index):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        torch_fwd_latency = list()
+        torch_bwd_latency = list()
+        custom_fwd_latency = list()
+        custom_bwd_latency = list()
+
+        iterations = 5
+        for i in range(iterations):
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+            if ignore_index >= 0 and ignore_index < vocabsize:
+                pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
+                labels = pad_labels[..., 1:].contiguous()
+
+            # -------- forward -------- #
+            start_event.record()
+            torch_logprobs = self.torch_linear_cross_entropy(
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            # -------- backward -------- #
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+
+            start_event.record()
+            (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_bwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_bwd_latency.append(start_event.elapsed_time(end_event))
+
+        # --- remove first latency due to warmup --- #
+        torch_fwd_latency = torch_fwd_latency[1:]
+        torch_bwd_latency = torch_bwd_latency[1:]
+        custom_fwd_latency = custom_fwd_latency[1:]
+        custom_bwd_latency = custom_bwd_latency[1:]
+
+        print()
+        print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}:")
+        print(
+            f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms"
+        )
+        print(
+            f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms"
+        )
+        print(
+            f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms"
+        )
+        print(
+            f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms"
+        )
+
+    @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    @pytest.mark.parametrize("ignore_index", [-100])
+    def test_storage(self, problem, dtype, reduction, ignore_index):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+        print()
+        print(f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}:")
+
+        def torch_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+            if ignore_index >= 0 and ignore_index < vocabsize:
+                pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
+                labels = pad_labels[..., 1:].contiguous()
+
+            torch.cuda.reset_peak_memory_stats()
+            torch_logprobs = self.torch_linear_cross_entropy(
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            print(f"[INFO]: Torch Forward pass peak memory: {torch_max_memory:.2f} MB")
+
+            torch.cuda.reset_peak_memory_stats()
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+            (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            torch.cuda.synchronize()
+            torch_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            print(f"[INFO]: Torch Backward pass peak memory: {torch_backward_max_memory:.2f} MB")
+
+        def custom_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+            if ignore_index >= 0 and ignore_index < vocabsize:
+                pad_labels = torch.nn.functional.pad(labels, (0, 1), value=ignore_index)
+                labels = pad_labels[..., 1:].contiguous()
+
+            torch.cuda.reset_peak_memory_stats()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels, reduction=reduction, ignore_index=ignore_index
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            print(f"[INFO]: Custom Forward pass peak memory: {custom_max_memory:.2f} MB")
+
+            torch.cuda.reset_peak_memory_stats()
+            g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1)
+            (d_custom_hidden, d_custom_weight) = torch.autograd.grad(
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            torch.cuda.synchronize()
+            custom_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            print(f"[INFO]: Custom Backward pass peak memory: {custom_backward_max_memory:.2f} MB")
+
+        self.cleanup()
+        torch_storage()
+        self.cleanup()
+        custom_storage()
+
+
+@pytest.mark.skipif(
+    ("WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2),  # or True,
+    reason="Requires torchrun with multiple GPUs",
+)
+@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10")
+@pytest.mark.usefixtures("distributed_context")
+class TestFusedLinearCrossEntropyTensorParallel:
+    @pytest.fixture(autouse=True)
+    def setup_attrs(self, distributed_context):
+        """
+        Setup attributes for the test class.
+        """
+        self.tp_group = distributed_context.group
+        self.tp_rank = distributed_context.rank
+        self.tp_world_size = distributed_context.world_size
+        self.is_chief = distributed_context.is_chief
+
+    def cleanup(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        import gc
+
+        gc.collect()
+        torch.cuda.synchronize()
+
+    @staticmethod
+    def torch_linear_cross_entropy_single_gpu(
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        reduction: typing.Optional[str] = "mean",
+    ):
+        logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+        logprobs = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.shape[-1]), labels.view(-1), reduction=reduction
+        )
+        return logprobs.to(torch.float32)
+
+    class TorchLinearCrossEntropy(torch.autograd.Function):
+        @staticmethod
+        def forward(
+            ctx,
+            hidden: torch.Tensor,
+            weight: torch.Tensor,
+            labels: torch.Tensor,
+            tp_group: torch.distributed.ProcessGroup,
+            reduction: typing.Optional[str] = "mean",
+        ):
+            tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
+            tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
+
+            logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+
+            whole_logits = torch.empty(
+                (logits.shape[0], logits.shape[-1] * tp_world_size),
+                dtype=logits.dtype,
+                device=logits.device,
+            )
+            whole_logits_ref = [
+                whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
+                for i in range(tp_world_size)
+            ]
+            dist.all_gather(whole_logits_ref, logits, group=tp_group)
+
+            logprobs = torch.nn.functional.cross_entropy(
+                whole_logits.view(-1, whole_logits.shape[-1]), labels.view(-1), reduction=reduction
+            )
+
+            # If we don't preserve whole_logits,
+            # we need to re-compute it in the backward pass
+            ctx.save_for_backward(hidden, weight, labels)
+            ctx.tp_group = tp_group
+            ctx.reduction = reduction
+            ctx.tp_rank = tp_rank
+            ctx.tp_world_size = tp_world_size
+
+            return logprobs.to(torch.float32)
+
+        @staticmethod
+        def backward(ctx, g_logprobs: torch.Tensor):
+            hidden, weight, labels = ctx.saved_tensors
+            tp_group = ctx.tp_group
+            reduction = ctx.reduction
+            tp_rank = ctx.tp_rank
+            tp_world_size = ctx.tp_world_size
+
+            num_tokens, dim = hidden.shape
+
+            if reduction == "mean":
+                _g_logprobs = torch.broadcast_to(g_logprobs / num_tokens, (num_tokens,))
+            elif reduction == "sum":
+                _g_logprobs = torch.broadcast_to(g_logprobs, (num_tokens,))
+            else:
+                _g_logprobs = g_logprobs
+
+            # re-compute whole_logits
+            logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+            whole_logits = torch.empty(
+                (logits.shape[0], logits.shape[-1] * tp_world_size),
+                dtype=logits.dtype,
+                device=logits.device,
+            )
+            whole_logits_ref = [
+                whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
+                for i in range(tp_world_size)
+            ]
+            dist.all_gather(whole_logits_ref, logits, group=tp_group)
+
+            one_hot = torch.zeros_like(whole_logits)
+            one_hot.scatter_(1, labels.view(-1).unsqueeze(-1), 1)
+
+            pd = torch.nn.functional.softmax(whole_logits, dim=-1)
+            d_logits = (pd - one_hot) * _g_logprobs.unsqueeze(-1)
+            d_logits = d_logits.to(hidden.dtype)
+
+            local_size = weight.size(0)
+            local_d_logits = d_logits[:, tp_rank * local_size : (tp_rank + 1) * local_size]
+
+            local_d_hidden = local_d_logits @ weight
+            local_d_weight = local_d_logits.T @ hidden
+
+            dist.all_reduce(local_d_hidden, op=dist.ReduceOp.SUM, group=tp_group)
+
+            return local_d_hidden, local_d_weight, None, None, None
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
+    @pytest.mark.parametrize("problem", [(4096, 129280, 8192)])
+    def test_torch_tp_vs_single_gpu(self, dtype, reduction, problem):
+        num_tokens, vocabsize, dim = problem
+        vocabsize = vocabsize // self.tp_world_size
+
+        hidden = (
+            torch.empty((num_tokens, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, (num_tokens,), dtype=torch.long, device="cuda")
+
+        # ------------ forward pass ------------ #
+        dist.broadcast(hidden, src=0, group=self.tp_group)
+        dist.broadcast(labels, src=0, group=self.tp_group)
+
+        # single GPU
+        whole_weight = torch.empty(
+            (vocabsize * self.tp_world_size, dim), dtype=dtype, device="cuda"
+        )
+        whole_weight_view = [
+            whole_weight[i * vocabsize : (i + 1) * vocabsize, :] for i in range(self.tp_world_size)
+        ]
+        dist.all_gather(whole_weight_view, weight, group=self.tp_group)
+        whole_weight = whole_weight.clone().requires_grad_()
+        logprobs_single_gpu = self.torch_linear_cross_entropy_single_gpu(
+            hidden, whole_weight, labels, reduction=reduction
+        )
+
+        # TP
+        logprobs_tp = self.TorchLinearCrossEntropy.apply(
+            hidden, weight, labels, self.tp_group, reduction
+        )
+        torch.testing.assert_close(logprobs_single_gpu, logprobs_tp)
+
+        # ------------ backward pass ------------ #
+        g_logprobs = torch.empty_like(logprobs_single_gpu).uniform_(-0.1, 0.1)
+        dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+        # single GPU
+        (d_hidden_single_gpu, d_weight_single_gpu) = torch.autograd.grad(
+            (logprobs_single_gpu,), (hidden, whole_weight), (g_logprobs,), retain_graph=False
+        )
+
+        # TP
+        (d_hidden_tp, d_weight_tp) = torch.autograd.grad(
+            (logprobs_tp,), (hidden, weight), (g_logprobs,), retain_graph=False
+        )
+        torch.testing.assert_close(d_hidden_single_gpu, d_hidden_tp, atol=1e-3, rtol=1e-3)
+        local_d_weight_single_gpu = d_weight_single_gpu[
+            self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], :
+        ]
+        torch.testing.assert_close(local_d_weight_single_gpu, d_weight_tp, atol=1e-3, rtol=1e-3)
+
+    @staticmethod
+    def get_problems():
+        return [
+            (80, 125, 64),
+            (80, 152064, 64),
+            (1024, 152064, 4096),
+            (4096, 152063, 8192),
+            ((1, 4096), 152064, 8192),
+            ((2, 4096), 152064, 8192),
+        ]
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
+    @pytest.mark.parametrize("problem", get_problems())
+    def test_correctness(self, dtype, reduction, problem):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        hidden = (
+            torch.empty(hidden_shape, dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+        # ------ forward pass ------ #
+        dist.broadcast(hidden, src=0, group=self.tp_group)
+        dist.broadcast(labels, src=0, group=self.tp_group)
+
+        torch_logprobs = self.TorchLinearCrossEntropy.apply(
+            hidden.view(-1, dim), weight, labels, self.tp_group, reduction
+        )
+
+        custom_logprobs = linear_cross_entropy(
+            hidden, weight, labels, tp_group=self.tp_group, reduction=reduction
+        )
+
+        torch.testing.assert_close(torch_logprobs, custom_logprobs)
+
+        # ------- backward pass ------- #
+        g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+        dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+        (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+            (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+        )
+        (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+            (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+        )
+        torch.testing.assert_close(d_hidden_torch, d_hidden_custom, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(d_weight_torch, d_weight_custom, atol=1e-4, rtol=1e-4)
+
+    @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    def test_performance(self, problem, dtype, reduction):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        torch_fwd_latency = list()
+        torch_bwd_latency = list()
+        custom_fwd_latency = list()
+        custom_bwd_latency = list()
+
+        iterations = 5
+        for i in range(iterations):
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            # ------ forward pass ------ #
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            start_event.record()
+            torch_logprobs = self.TorchLinearCrossEntropy.apply(
+                hidden.view(-1, dim), weight, labels, self.tp_group, reduction
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels, tp_group=self.tp_group, reduction=reduction
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            # ------- backward pass ------- #
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            start_event.record()
+            (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_bwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_bwd_latency.append(start_event.elapsed_time(end_event))
+
+        # --- remove first latency due to warmup --- #
+        torch_fwd_latency = torch_fwd_latency[1:]
+        torch_bwd_latency = torch_bwd_latency[1:]
+        custom_fwd_latency = custom_fwd_latency[1:]
+        custom_bwd_latency = custom_bwd_latency[1:]
+
+        if self.is_chief:
+            print()
+            print(
+                f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:"
+            )
+            print(
+                f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms"
+            )
+
+    @pytest.mark.parametrize("problem", [((1, 4096), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    def test_storage(self, problem, dtype, reduction):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (num_tokens,) if isinstance(num_tokens, int) else num_tokens
+
+        if self.is_chief:
+            print()
+            print(
+                f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}:"
+            )
+
+        def torch_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            torch_logprobs = self.TorchLinearCrossEntropy.apply(
+                hidden.view(-1, dim), weight, labels, self.tp_group, reduction
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB"
+                )
+
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB"
+                )
+
+        def custom_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            custom_logprobs = linear_cross_entropy(
+                hidden, weight, labels, tp_group=self.tp_group, reduction=reduction
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB"
+                )
+
+            g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1)
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB"
+                )
+
+        self.cleanup()
+        torch_storage()
+        self.cleanup()
+        custom_storage()
+
+
+@pytest.mark.skipif(
+    "WORLD_SIZE" not in os.environ or int(os.environ["WORLD_SIZE"]) < 2,
+    reason="Requires torchrun with multiple GPUs",
+)
+@pytest.mark.skipif(get_device_arch_version() != 10, reason="Requires GPU architecture = 10")
+@pytest.mark.usefixtures("distributed_context")
+class TestFusedLinearCrossEntropySequenceParallel:
+    @pytest.fixture(autouse=True)
+    def setup_attrs(self, distributed_context):
+        """
+        Setup attributes for the test class.
+        """
+        self.tp_group = distributed_context.group
+        self.tp_rank = distributed_context.rank
+        self.tp_world_size = distributed_context.world_size
+        self.is_chief = distributed_context.is_chief
+
+    @staticmethod
+    def timed_barrier(timeout_s=10):
+        import time
+
+        work = torch.distributed.barrier(async_op=True)
+        t0 = time.time()
+        while not work.is_completed():
+            if time.time() - t0 > timeout_s:
+                exit(1)
+            time.sleep(0.05)
+        work.wait()
+
+    def cleanup(self):
+        torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
+        import gc
+
+        gc.collect()
+        torch.cuda.synchronize()
+
+    @staticmethod
+    def torch_linear_cross_entropy_single_gpu(
+        hidden: torch.Tensor,
+        weight: torch.Tensor,
+        labels: torch.Tensor,
+        reduction: typing.Optional[str] = "mean",
+    ):
+        logits = hidden.to(torch.float32) @ weight.T.to(torch.float32)
+        logprobs = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.shape[-1]), labels.view(-1), reduction=reduction
+        )
+        return logprobs.to(torch.float32)
+
+    class TorchLinearCrossEntropy(torch.autograd.Function):
+        @staticmethod
+        def forward(
+            ctx,
+            hidden: torch.Tensor,
+            weight: torch.Tensor,
+            labels: torch.Tensor,
+            tp_group: torch.distributed.ProcessGroup,
+            reduction: typing.Optional[str] = "mean",
+        ):
+            tp_rank = 0 if tp_group is None else torch.distributed.get_rank(tp_group)
+            tp_world_size = 1 if tp_group is None else torch.distributed.get_world_size(tp_group)
+
+            whole_hidden = torch.empty(
+                (hidden.shape[0] * tp_world_size, hidden.shape[-1]),
+                dtype=hidden.dtype,
+                device=hidden.device,
+            )
+            dist.all_gather_into_tensor(whole_hidden, hidden, group=tp_group)
+
+            logits = whole_hidden.to(torch.float32) @ weight.T.to(torch.float32)
+
+            whole_logits = torch.empty(
+                (logits.shape[0], logits.shape[-1] * tp_world_size),
+                dtype=logits.dtype,
+                device=logits.device,
+            )
+            whole_logits_ref = [
+                whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
+                for i in range(tp_world_size)
+            ]
+            dist.all_gather(whole_logits_ref, logits, group=tp_group)
+
+            logprobs = torch.nn.functional.cross_entropy(
+                whole_logits.view(-1, whole_logits.shape[-1]), labels.view(-1), reduction=reduction
+            )
+
+            # If we don't preserve whole_logits,
+            # we need to re-compute it in the backward pass
+            ctx.save_for_backward(whole_hidden, weight, labels)
+            ctx.tp_group = tp_group
+            ctx.reduction = reduction
+            ctx.tp_rank = tp_rank
+            ctx.tp_world_size = tp_world_size
+
+            return logprobs.to(torch.float32)
+
+        @staticmethod
+        def backward(ctx, g_logprobs: torch.Tensor):
+            whole_hidden, weight, labels = ctx.saved_tensors
+            tp_group = ctx.tp_group
+            reduction = ctx.reduction
+            tp_rank = ctx.tp_rank
+            tp_world_size = ctx.tp_world_size
+
+            num_tokens, dim = whole_hidden.shape
+
+            if reduction == "mean":
+                _g_logprobs = torch.broadcast_to(g_logprobs / num_tokens, (num_tokens,))
+            elif reduction == "sum":
+                _g_logprobs = torch.broadcast_to(g_logprobs, (num_tokens,))
+            else:
+                _g_logprobs = g_logprobs
+
+            # re-compute whole_logits
+            logits = whole_hidden.to(torch.float32) @ weight.T.to(torch.float32)
+            whole_logits = torch.empty(
+                (logits.shape[0], logits.shape[-1] * tp_world_size),
+                dtype=logits.dtype,
+                device=logits.device,
+            )
+            whole_logits_ref = [
+                whole_logits[..., i * logits.shape[-1] : (i + 1) * logits.shape[-1]]
+                for i in range(tp_world_size)
+            ]
+            dist.all_gather(whole_logits_ref, logits, group=tp_group)
+
+            one_hot = torch.zeros_like(whole_logits)
+            one_hot.scatter_(1, labels.view(-1).unsqueeze(-1), 1)
+
+            pd = torch.nn.functional.softmax(whole_logits, dim=-1)
+            d_logits = (pd - one_hot) * _g_logprobs.unsqueeze(-1)
+            d_logits = d_logits.to(whole_hidden.dtype)
+
+            local_size = weight.size(0)
+            local_d_logits = d_logits[:, tp_rank * local_size : (tp_rank + 1) * local_size]
+
+            d_hidden = local_d_logits @ weight
+            local_d_weight = local_d_logits.T @ whole_hidden
+
+            # dist.all_reduce(
+            #     local_d_hidden,
+            #     op=dist.ReduceOp.SUM,
+            #     group=tp_group
+            # )
+
+            # split the local_d_hidden along the sequence length dimension
+            local_num_tokens = num_tokens // tp_world_size
+            # local_d_hidden = local_d_hidden[tp_rank * local_num_tokens : (tp_rank + 1) * local_num_tokens, :]
+
+            local_d_hidden = torch.empty(
+                (local_num_tokens, dim), dtype=weight.dtype, device=weight.device
+            )
+            dist.reduce_scatter_tensor(
+                local_d_hidden, d_hidden, op=dist.ReduceOp.SUM, group=tp_group
+            )
+            return local_d_hidden, local_d_weight, None, None, None
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
+    @pytest.mark.parametrize("problem", [(256, 129280, 8192)])
+    def test_torch_sp_vs_single_gpu(self, dtype, reduction, problem):
+        num_tokens, vocabsize, dim = problem
+        vocabsize = vocabsize // self.tp_world_size
+
+        hidden = (
+            torch.empty((num_tokens, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(
+            0, vocabsize, (num_tokens * self.tp_world_size,), dtype=torch.long, device="cuda"
+        )
+
+        # ------------ forward pass ------------ #
+        dist.broadcast(labels, src=0, group=self.tp_group)
+
+        # single GPU
+        whole_hidden = torch.empty(
+            (num_tokens * self.tp_world_size, dim), dtype=dtype, device="cuda"
+        )
+        dist.all_gather_into_tensor(whole_hidden, hidden, group=self.tp_group)
+        whole_hidden = whole_hidden.clone().requires_grad_()
+
+        whole_weight = torch.empty(
+            (vocabsize * self.tp_world_size, dim), dtype=dtype, device="cuda"
+        )
+        whole_weight_view = [
+            whole_weight[i * vocabsize : (i + 1) * vocabsize, :] for i in range(self.tp_world_size)
+        ]
+        dist.all_gather(whole_weight_view, weight, group=self.tp_group)
+        whole_weight = whole_weight.clone().requires_grad_()
+        logprobs_single_gpu = self.torch_linear_cross_entropy_single_gpu(
+            whole_hidden, whole_weight, labels, reduction=reduction
+        )
+
+        # TP
+        logprobs_tp = self.TorchLinearCrossEntropy.apply(
+            hidden, weight, labels, self.tp_group, reduction
+        )
+        torch.testing.assert_close(logprobs_single_gpu, logprobs_tp)
+
+        # ------------ backward pass ------------ #
+        g_logprobs = torch.empty_like(logprobs_single_gpu).uniform_(-0.1, 0.1)
+        dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+        # single GPU
+        (d_hidden_single_gpu, d_weight_single_gpu) = torch.autograd.grad(
+            (logprobs_single_gpu,), (whole_hidden, whole_weight), (g_logprobs,), retain_graph=False
+        )
+
+        # TP
+        (d_hidden_tp, d_weight_tp) = torch.autograd.grad(
+            (logprobs_tp,), (hidden, weight), (g_logprobs,), retain_graph=False
+        )
+
+        local_d_hidden_single_gpu = d_hidden_single_gpu[
+            self.tp_rank * hidden.shape[0] : (self.tp_rank + 1) * hidden.shape[0], :
+        ]
+        torch.testing.assert_close(local_d_hidden_single_gpu, d_hidden_tp, atol=1e-3, rtol=1e-3)
+        local_d_weight_single_gpu = d_weight_single_gpu[
+            self.tp_rank * weight.shape[0] : (self.tp_rank + 1) * weight.shape[0], :
+        ]
+        torch.testing.assert_close(local_d_weight_single_gpu, d_weight_tp, atol=1e-3, rtol=1e-3)
+
+        self.cleanup()
+
+    @staticmethod
+    def get_problems():
+        return [
+            (80, 125, 64),
+            (80, 152064, 64),
+            (1024, 152064, 4096),
+            (4096, 15206, 1024),
+            ((1, 4096), 15206, 1024),
+            ((4, 1024), 15206, 1024),
+        ]
+
+    @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
+    @pytest.mark.parametrize("reduction", ["mean", "sum", "none"])
+    @pytest.mark.parametrize("problem", get_problems())
+    def test_correctness(self, dtype, reduction, problem):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (
+            (num_tokens * self.tp_world_size,)
+            if isinstance(num_tokens, int)
+            else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        )
+
+        hidden = (
+            torch.empty(hidden_shape, dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        weight = (
+            torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+            .uniform_(-0.1, 0.1)
+            .requires_grad_()
+        )
+        labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+        # ------ forward pass ------ #
+        dist.broadcast(labels, src=0, group=self.tp_group)
+
+        torch_logprobs = self.TorchLinearCrossEntropy.apply(
+            hidden.view(-1, dim), weight, labels, self.tp_group, reduction
+        )
+
+        custom_logprobs = linear_cross_entropy(
+            hidden,
+            weight,
+            labels,
+            tp_group=self.tp_group,
+            reduction=reduction,
+            sequence_parallel=True,
+        )
+
+        torch.testing.assert_close(torch_logprobs, custom_logprobs)
+
+        # ------- backward pass ------- #
+        g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+        dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+        (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+            (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+        )
+        (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+            (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+        )
+
+        # in case one GPU failed, and leading to hang
+        torch.testing.assert_close(d_hidden_torch, d_hidden_custom, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(d_weight_torch, d_weight_custom, atol=1e-3, rtol=1e-3)
+        self.timed_barrier()
+
+        self.cleanup()
+
+    @pytest.mark.parametrize("problem", [((1, 1024), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    def test_performance(self, problem, dtype, reduction):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (
+            (num_tokens * self.tp_world_size,)
+            if isinstance(num_tokens, int)
+            else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        )
+
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+
+        torch_fwd_latency = list()
+        torch_bwd_latency = list()
+        custom_fwd_latency = list()
+        custom_bwd_latency = list()
+
+        iterations = 5
+        for i in range(iterations):
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            # ------ forward pass ------ #
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            start_event.record()
+            torch_logprobs = self.TorchLinearCrossEntropy.apply(
+                hidden.view(-1, dim), weight, labels, self.tp_group, reduction
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            custom_logprobs = linear_cross_entropy(
+                hidden,
+                weight,
+                labels,
+                tp_group=self.tp_group,
+                reduction=reduction,
+                sequence_parallel=True,
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_fwd_latency.append(start_event.elapsed_time(end_event))
+
+            # ------- backward pass ------- #
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            start_event.record()
+            (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            torch_bwd_latency.append(start_event.elapsed_time(end_event))
+
+            start_event.record()
+            (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            end_event.record()
+            torch.cuda.synchronize()
+            custom_bwd_latency.append(start_event.elapsed_time(end_event))
+
+        # --- remove first latency due to warmup --- #
+        torch_fwd_latency = torch_fwd_latency[1:]
+        torch_bwd_latency = torch_bwd_latency[1:]
+        custom_fwd_latency = custom_fwd_latency[1:]
+        custom_bwd_latency = custom_bwd_latency[1:]
+
+        if self.is_chief:
+            print()
+            print(
+                f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:"
+            )
+            print(
+                f"[INFO]: Torch forward latency: {sum(torch_fwd_latency) / len(torch_fwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Custom forward latency: {sum(custom_fwd_latency) / len(custom_fwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Torch backward latency: {sum(torch_bwd_latency) / len(torch_bwd_latency):.2f} ms"
+            )
+            print(
+                f"[INFO]: Custom backward latency: {sum(custom_bwd_latency) / len(custom_bwd_latency):.2f} ms"
+            )
+
+    @pytest.mark.parametrize("problem", [((1, 1024), 129280, 7168)])
+    @pytest.mark.parametrize("dtype", [torch.bfloat16])
+    @pytest.mark.parametrize("reduction", ["mean"])
+    def test_storage(self, problem, dtype, reduction):
+        num_tokens, vocabsize, dim = problem
+        hidden_shape = (num_tokens, dim) if isinstance(num_tokens, int) else (*num_tokens, dim)
+        labels_shape = (
+            (num_tokens * self.tp_world_size,)
+            if isinstance(num_tokens, int)
+            else (num_tokens[0] * self.tp_world_size, *num_tokens[1:])
+        )
+
+        if self.is_chief:
+            print()
+            print(
+                f"[INFO]: On problem {problem}, dtype {dtype}, reduction {reduction}, TP size {self.tp_world_size}, Sequence Parallel: True:"
+            )
+
+        def torch_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            torch_logprobs = self.TorchLinearCrossEntropy.apply(
+                hidden.view(-1, dim), weight, labels, self.tp_group, reduction
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Torch Forward pass peak memory: {torch_max_memory:.2f} MB"
+                )
+
+            g_logprobs = torch.empty_like(torch_logprobs).uniform_(-0.1, 0.1)
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            (d_hidden_torch, d_weight_torch) = torch.autograd.grad(
+                (torch_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            torch.cuda.synchronize()
+            torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Torch Backward pass peak memory: {torch_max_memory:.2f} MB"
+                )
+
+        def custom_storage():
+            hidden = (
+                torch.empty(hidden_shape, dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            weight = (
+                torch.empty((vocabsize, dim), dtype=dtype, device="cuda")
+                .uniform_(-0.1, 0.1)
+                .requires_grad_()
+            )
+            labels = torch.randint(0, vocabsize, labels_shape, dtype=torch.long, device="cuda")
+
+            dist.broadcast(hidden, src=0, group=self.tp_group)
+            dist.broadcast(labels, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            custom_logprobs = linear_cross_entropy(
+                hidden,
+                weight,
+                labels,
+                tp_group=self.tp_group,
+                reduction=reduction,
+                sequence_parallel=True,
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Custom Forward pass peak memory: {custom_max_memory:.2f} MB"
+                )
+
+            g_logprobs = torch.empty_like(custom_logprobs).uniform_(-0.1, 0.1)
+            dist.broadcast(g_logprobs, src=0, group=self.tp_group)
+
+            torch.cuda.reset_peak_memory_stats()
+            (d_hidden_custom, d_weight_custom) = torch.autograd.grad(
+                (custom_logprobs,), (hidden, weight), (g_logprobs,), retain_graph=False
+            )
+            torch.cuda.synchronize()
+            custom_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
+            if self.is_chief:
+                print(
+                    f"[INFO]: On GPU {self.tp_rank}, Custom Backward pass peak memory: {custom_max_memory:.2f} MB"
+                )
+
+        self.cleanup()
+        torch_storage()
+        self.cleanup()
+        custom_storage()
diff --git a/tests/unit_tests/fusions/test_fused_mhc_kernels.py b/tests/unit_tests/fusions/test_fused_mhc_kernels.py
new file mode 100644
index 00000000000..15468df8264
--- /dev/null
+++ b/tests/unit_tests/fusions/test_fused_mhc_kernels.py
@@ -0,0 +1,564 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""Unit tests for fused mHC kernels (cuTile) and native implementations.
+
+Each test compares the fused kernel's forward output AND backward gradients
+against a pure-PyTorch differentiable reference to catch numerical drift
+introduced by kernel fusion.
+"""
+
+import math
+from typing import Optional
+
+import pytest
+import torch
+from torch import Tensor
+
+from megatron.core.fusions.fused_mhc_kernels import is_cutile_available
+from megatron.core.transformer.hyper_connection import (
+    native_h_aggregate,
+    native_h_post_bda,
+    native_proj_rms,
+    native_sinkhorn,
+)
+
+_require_cutile = pytest.mark.skipif(not is_cutile_available(), reason="cuTile not installed")
+
+
+@pytest.fixture(autouse=True)
+def _skip_without_cuda():
+    if not torch.cuda.is_available():
+        pytest.skip("CUDA not available")
+
+
+DTYPE = torch.bfloat16
+DEVICE = "cuda"
+FWD_ATOL, FWD_RTOL = 2e-2, 2e-2
+BWD_ATOL, BWD_RTOL = 5e-2, 5e-2
+RAND_LO, RAND_HI = -0.1, 0.1
+COSINE_SIM_THRESH = 0.999
+
+
+def _assert_cosine_similar(a: Tensor, b: Tensor, threshold: float, msg: str = ""):
+    """Assert that flattened tensors have cosine similarity >= threshold."""
+    a_flat = a.flatten().float()
+    b_flat = b.flatten().float()
+    sim = torch.nn.functional.cosine_similarity(a_flat.unsqueeze(0), b_flat.unsqueeze(0)).item()
+    assert sim >= threshold, (
+        f"{msg}: cosine similarity {sim:.6f} < {threshold} "
+        f"(max_abs_diff={torch.max(torch.abs(a_flat - b_flat)):.6e})"
+    )
+
+
+def _rand(*shape, **kwargs):
+    """Uniform in [RAND_LO, RAND_HI] to keep magnitudes small for bf16 stability."""
+    return torch.empty(*shape, dtype=DTYPE, device=DEVICE, **kwargs).uniform_(RAND_LO, RAND_HI)
+
+
+def _info():
+    backend = "cuTile" if is_cutile_available() else "native"
+    print(f"\n  [backend: {backend}]")
+
+
+# ============================================================================
+# Pure-PyTorch differentiable references (used by both fwd AND bwd tests)
+# ============================================================================
+
+
+def _ref_sinkhorn(logits: Tensor, num_iters: int, eps: float = 1e-6) -> Tensor:
+    row_max = logits.max(dim=-1, keepdim=True).values
+    M = torch.exp(logits - row_max)
+    for _ in range(num_iters):
+        M = M / M.sum(dim=-1, keepdim=True).clamp(min=eps)
+        M = M / M.sum(dim=-2, keepdim=True).clamp(min=eps)
+    return M
+
+
+def _ref_h_aggregate(x: Tensor, h_pre: Tensor) -> Tensor:
+    return (x * h_pre.unsqueeze(-1)).sum(dim=2)
+
+
+def _ref_h_post_bda(
+    h_res: Tensor, orig_res: Tensor, h_post: Tensor, x: Tensor, bias: Optional[Tensor]
+) -> Tensor:
+    s, b, n, C = orig_res.shape
+    mixed = torch.bmm(h_res.view(s * b, n, n), orig_res.view(s * b, n, C)).view(s, b, n, C)
+    x_exp = h_post.unsqueeze(-1) * x.unsqueeze(2)
+    out = x_exp + mixed
+    if bias is not None:
+        out = out + h_post.unsqueeze(-1) * bias.view(1, 1, 1, C)
+    return out
+
+
+def _ref_proj_rms(x: Tensor, weight: Tensor, eps: float = 1e-6):
+    proj = torch.matmul(x, weight.t())
+    norm = x.norm(dim=-1, keepdim=True)
+    K = x.shape[-1]
+    r = 1.0 / (norm / math.sqrt(K) + eps)
+    return proj, r
+
+
+# ============================================================================
+# Sinkhorn
+# ============================================================================
+
+
+class TestNativeSinkhorn:
+    """Tests for the native SinkhornKnopp implementation."""
+
+    @pytest.mark.parametrize("s,b,n,iters", [(2, 4, 4, 5), (1, 1, 2, 10)])
+    def test_fwd_bwd_vs_torch_reference(self, s, b, n, iters):
+        """native_sinkhorn fwd output and bwd grad must match the inline PyTorch reference."""
+        _info()
+        eps = 1e-6
+        data = _rand(s, b, n, n)
+        grad_out = _rand(s, b, n, n)
+
+        # -- native_sinkhorn path (autograd.Function) --
+        inp_f = data.clone().requires_grad_(True)
+        out_f = native_sinkhorn(inp_f, iters, eps)
+        out_f.backward(grad_out)
+        grad_f = inp_f.grad.clone()
+
+        # -- inline torch reference (fully differentiable) --
+        inp_r = data.clone().requires_grad_(True)
+        out_r = _ref_sinkhorn(inp_r, iters, eps)
+        out_r.backward(grad_out)
+        grad_r = inp_r.grad.clone()
+
+        torch.testing.assert_close(out_f, out_r, atol=FWD_ATOL, rtol=FWD_RTOL)
+        torch.testing.assert_close(grad_f, grad_r, atol=BWD_ATOL, rtol=BWD_RTOL)
+
+
+class TestFusedSinkhorn:
+    @_require_cutile
+    @pytest.mark.parametrize("s,b,n,iters", [(2, 4, 4, 5), (1, 1, 2, 10)])
+    def test_fwd_bwd_vs_reference(self, s, b, n, iters):
+        """E2E: fused cuTile fwd output and bwd grad must match the PyTorch reference."""
+        from megatron.core.fusions.fused_mhc_kernels import fused_sinkhorn
+
+        _info()
+        eps = 1e-6
+        data = _rand(s, b, n, n)
+        grad_out = _rand(s, b, n, n)
+
+        # -- fused path --
+        inp_f = data.clone().requires_grad_(True)
+        out_f = fused_sinkhorn(inp_f, iters, eps)
+        out_f.backward(grad_out)
+        grad_f = inp_f.grad.clone()
+
+        # -- reference path (fully differentiable) --
+        inp_r = data.clone().requires_grad_(True)
+        out_r = _ref_sinkhorn(inp_r, iters, eps)
+        out_r.backward(grad_out)
+        grad_r = inp_r.grad.clone()
+
+        torch.testing.assert_close(out_f, out_r, atol=FWD_ATOL, rtol=FWD_RTOL)
+        torch.testing.assert_close(grad_f, grad_r, atol=BWD_ATOL, rtol=BWD_RTOL)
+
+
+# ============================================================================
+# H_aggregate
+# ============================================================================
+
+
+class TestNativeHAggregate:
+    """Tests for native_h_aggregate."""
+
+    @pytest.mark.parametrize("s,b,n,C", [(2, 4, 4, 1024), (1, 1, 2, 256)])
+    def test_fwd_bwd_vs_torch_reference(self, s, b, n, C):
+        _info()
+        x_data = _rand(s, b, n, C)
+        h_data = _rand(s, b, n)
+        grad_out = _rand(s, b, C)
+
+        xf = x_data.clone().requires_grad_(True)
+        hf = h_data.clone().requires_grad_(True)
+        of = native_h_aggregate(xf, hf)
+        of.backward(grad_out)
+
+        xr = x_data.clone().requires_grad_(True)
+        hr = h_data.clone().requires_grad_(True)
+        oref = _ref_h_aggregate(xr, hr)
+        oref.backward(grad_out)
+
+        torch.testing.assert_close(of, oref, atol=FWD_ATOL, rtol=FWD_RTOL)
+        torch.testing.assert_close(xf.grad, xr.grad, atol=BWD_ATOL, rtol=BWD_RTOL)
+        torch.testing.assert_close(hf.grad, hr.grad, atol=BWD_ATOL, rtol=BWD_RTOL)
+
+
+class TestFusedHAggregate:
+    @_require_cutile
+    @pytest.mark.parametrize("s,b,n,C", [(2, 4, 4, 1024), (1, 1, 2, 256)])
+    def test_fwd_bwd_vs_reference(self, s, b, n, C):
+        """E2E: fused cuTile fwd output and bwd grads must match the PyTorch reference."""
+        from megatron.core.fusions.fused_mhc_kernels import fused_h_aggregate
+
+        _info()
+        x_data = _rand(s, b, n, C)
+        h_data = _rand(s, b, n)
+        grad_out = _rand(s, b, C)
+
+        # -- fused path --
+        xf = x_data.clone().requires_grad_(True)
+        hf = h_data.clone().requires_grad_(True)
+        of = fused_h_aggregate(xf, hf)
+        of.backward(grad_out)
+
+        # -- reference path --
+        xr = x_data.clone().requires_grad_(True)
+        hr = h_data.clone().requires_grad_(True)
+        oref = _ref_h_aggregate(xr, hr)
+        oref.backward(grad_out)
+
+        torch.testing.assert_close(of, oref, atol=FWD_ATOL, rtol=FWD_RTOL)
+        torch.testing.assert_close(xf.grad, xr.grad, atol=BWD_ATOL, rtol=BWD_RTOL)
+        torch.testing.assert_close(hf.grad, hr.grad, atol=BWD_ATOL, rtol=BWD_RTOL)
+
+
+# ============================================================================
+# H_post BDA
+# ============================================================================
+
+
+class TestNativeHPostBDA:
+    """Tests for native_h_post_bda."""
+
+    @pytest.mark.parametrize("with_bias", [True, False])
+    @pytest.mark.parametrize("s,b,n,C", [(2, 4, 4, 1024), (1, 2, 2, 256)])
+    def test_fwd_bwd_vs_torch_reference(self, s, b, n, C, with_bias):
+        _info()
+        hr_data = _rand(s, b, n, n)
+        orig_data = _rand(s, b, n, C)
+        hp_data = _rand(s, b, n)
+        x_data = _rand(s, b, C)
+        bias_data = _rand(C) if with_bias else None
+        grad_out = _rand(s, b, n, C)
+
+        def _make_inputs():
+            hr = hr_data.clone().requires_grad_(True)
+            orig = orig_data.clone().requires_grad_(True)
+            hp = hp_data.clone().requires_grad_(True)
+            x = x_data.clone().requires_grad_(True)
+            bi = bias_data.clone().requires_grad_(True) if with_bias else None
+            return hr, orig, hp, x, bi
+
+        hr_f, orig_f, hp_f, x_f, bi_f = _make_inputs()
+        out_f = native_h_post_bda(hr_f, orig_f, hp_f, x_f, bi_f)
+        out_f.backward(grad_out)
+
+        hr_r, orig_r, hp_r, x_r, bi_r = _make_inputs()
+        out_r = _ref_h_post_bda(hr_r, orig_r, hp_r, x_r, bi_r)
+        out_r.backward(grad_out)
+
+        torch.testing.assert_close(out_f, out_r, atol=FWD_ATOL, rtol=FWD_RTOL)
+        for name, gf, gr in [
+            ("h_res", hr_f.grad, hr_r.grad),
+            ("orig_res", orig_f.grad, orig_r.grad),
+            ("h_post", hp_f.grad, hp_r.grad),
+            ("x", x_f.grad, x_r.grad),
+        ]:
+            torch.testing.assert_close(
+                gf, gr, atol=BWD_ATOL, rtol=BWD_RTOL, msg=f"backward mismatch on {name}"
+            )
+        if with_bias:
+            torch.testing.assert_close(
+                bi_f.grad, bi_r.grad, atol=BWD_ATOL, rtol=BWD_RTOL, msg="backward mismatch on bias"
+            )
+
+
+class TestFusedHPostBDA:
+    @_require_cutile
+    @pytest.mark.parametrize("with_bias", [True, False])
+    @pytest.mark.parametrize("s,b,n,C", [(2, 4, 4, 1024), (1, 2, 2, 256)])
+    def test_fwd_bwd_vs_reference(self, s, b, n, C, with_bias):
+        """E2E: fused cuTile fwd output and bwd grads must match the PyTorch reference."""
+        from megatron.core.fusions.fused_mhc_kernels import fused_h_post_bda
+
+        _info()
+        hr_data = _rand(s, b, n, n)
+        orig_data = _rand(s, b, n, C)
+        hp_data = _rand(s, b, n)
+        x_data = _rand(s, b, C)
+        bias_data = _rand(C) if with_bias else None
+        grad_out = _rand(s, b, n, C)
+
+        def _make_inputs():
+            hr = hr_data.clone().requires_grad_(True)
+            orig = orig_data.clone().requires_grad_(True)
+            hp = hp_data.clone().requires_grad_(True)
+            x = x_data.clone().requires_grad_(True)
+            bi = bias_data.clone().requires_grad_(True) if with_bias else None
+            return hr, orig, hp, x, bi
+
+        # -- fused path --
+        hr_f, orig_f, hp_f, x_f, bi_f = _make_inputs()
+        out_f = fused_h_post_bda(hr_f, orig_f, hp_f, x_f, bi_f)
+        out_f.backward(grad_out)
+
+        # -- reference path --
+        hr_r, orig_r, hp_r, x_r, bi_r = _make_inputs()
+        out_r = _ref_h_post_bda(hr_r, orig_r, hp_r, x_r, bi_r)
+        out_r.backward(grad_out)
+
+        torch.testing.assert_close(out_f, out_r, atol=FWD_ATOL, rtol=FWD_RTOL)
+        for name, gf, gr in [
+            ("h_res", hr_f.grad, hr_r.grad),
+            ("orig_res", orig_f.grad, orig_r.grad),
+            ("h_post", hp_f.grad, hp_r.grad),
+            ("x", x_f.grad, x_r.grad),
+        ]:
+            torch.testing.assert_close(
+                gf, gr, atol=BWD_ATOL, rtol=BWD_RTOL, msg=f"backward mismatch on {name}"
+            )
+        if with_bias:
+            torch.testing.assert_close(
+                bi_f.grad, bi_r.grad, atol=BWD_ATOL, rtol=BWD_RTOL, msg="backward mismatch on bias"
+            )
+
+
+# ============================================================================
+# Proj RMS
+# ============================================================================
+
+
+class TestNativeProjRms:
+    """Tests for native_proj_rms."""
+
+    @pytest.mark.parametrize("M,N,K", [(256, 20, 4096), (64, 8, 512)])
+    def test_fwd_bwd_vs_torch_reference(self, M, N, K):
+        _info()
+        eps = 1e-6
+        x_data = _rand(M, K)
+        w_data = _rand(N, K)
+        grad_proj = _rand(M, N)
+        grad_r = _rand(M, 1)
+
+        xf = x_data.clone().requires_grad_(True)
+        wf = w_data.clone().requires_grad_(True)
+        proj_f, r_f = native_proj_rms(xf, wf, eps)
+        (proj_f * grad_proj + r_f * grad_r).sum().backward()
+
+        xr = x_data.clone().requires_grad_(True)
+        wr = w_data.clone().requires_grad_(True)
+        proj_r, r_r = _ref_proj_rms(xr, wr, eps)
+        (proj_r * grad_proj + r_r * grad_r).sum().backward()
+
+        torch.testing.assert_close(proj_f, proj_r, atol=FWD_ATOL, rtol=FWD_RTOL)
+        torch.testing.assert_close(r_f, r_r, atol=FWD_ATOL, rtol=FWD_RTOL)
+        torch.testing.assert_close(
+            xf.grad, xr.grad, atol=BWD_ATOL, rtol=BWD_RTOL, msg="backward mismatch on x"
+        )
+        torch.testing.assert_close(
+            wf.grad, wr.grad, atol=BWD_ATOL, rtol=BWD_RTOL, msg="backward mismatch on weight"
+        )
+
+
+class TestFusedProjRms:
+    @_require_cutile
+    @pytest.mark.parametrize("M,N,K", [(256, 20, 4096), (64, 8, 512)])
+    def test_fwd_bwd_vs_reference(self, M, N, K):
+        """E2E: fused cuTile fwd output and bwd grads must match the PyTorch reference."""
+        from megatron.core.fusions.fused_mhc_kernels import fused_proj_rms
+
+        _info()
+        eps = 1e-6
+        x_data = _rand(M, K)
+        w_data = _rand(N, K)
+        grad_proj = _rand(M, N)
+        grad_r = _rand(M, 1)
+
+        # -- fused path --
+        xf = x_data.clone().requires_grad_(True)
+        wf = w_data.clone().requires_grad_(True)
+        proj_f, r_f = fused_proj_rms(xf, wf, eps)
+        (proj_f * grad_proj + r_f * grad_r).sum().backward()
+
+        # -- reference path --
+        xr = x_data.clone().requires_grad_(True)
+        wr = w_data.clone().requires_grad_(True)
+        proj_r, r_r = _ref_proj_rms(xr, wr, eps)
+        (proj_r * grad_proj + r_r * grad_r).sum().backward()
+
+        torch.testing.assert_close(proj_f, proj_r, atol=FWD_ATOL, rtol=FWD_RTOL)
+        torch.testing.assert_close(r_f, r_r, atol=FWD_ATOL, rtol=FWD_RTOL)
+        torch.testing.assert_close(
+            xf.grad, xr.grad, atol=BWD_ATOL, rtol=BWD_RTOL, msg="backward mismatch on x"
+        )
+        torch.testing.assert_close(
+            wf.grad, wr.grad, atol=BWD_ATOL, rtol=BWD_RTOL, msg="backward mismatch on weight"
+        )
+
+
+# ============================================================================
+# End-to-end pipeline (all four kernels chained)
+# ============================================================================
+
+
+class TestEndToEndNative:
+    """Full mHC pipeline using native modules.
+
+    proj_rms -> compute_h -> sinkhorn -> aggregate -> h_post_bda.
+    Compares the native modules against inline PyTorch reference.
+    """
+
+    def test_full_pipeline_fwd_bwd(self):
+        _info()
+        s, b, n, C = 2, 4, 4, 1024
+        eps = 1e-6
+        sinkhorn_iters = 5
+
+        hs_data = _rand(s, b, n * C)
+        w_data = _rand(n * n + 2 * n, n * C)
+        layer_out_data = _rand(s, b, C)
+        layer_bias_data = _rand(C)
+
+        def _run_native_modules():
+            hs = hs_data.clone().requires_grad_(True)
+            w = w_data.clone().requires_grad_(True)
+
+            x_2d = hs.reshape(s * b, n * C)
+            proj, r = native_proj_rms(x_2d, w, eps)
+            proj = proj.view(s, b, -1)
+            r = r.view(s, b, 1)
+
+            h = r * proj
+            h_pre = h[..., :n].sigmoid()
+            h_post = h[..., n : 2 * n].sigmoid() * 2
+            h_res_logits = h[..., 2 * n :]
+            h_res = native_sinkhorn(h_res_logits.view(s, b, n, n), sinkhorn_iters, eps)
+
+            aggregated = native_h_aggregate(hs.view(s, b, n, C), h_pre)
+
+            output = native_h_post_bda(
+                h_res, hs.view(s, b, n, C), h_post, layer_out_data, layer_bias_data
+            )
+
+            loss = output.sum() + aggregated.sum()
+            loss.backward()
+            return output.detach(), aggregated.detach(), hs.grad.clone()
+
+        def _run_inline_ref():
+            hs = hs_data.clone().requires_grad_(True)
+            w = w_data.clone().requires_grad_(True)
+
+            x_2d = hs.reshape(s * b, n * C)
+            proj, r = _ref_proj_rms(x_2d, w, eps)
+            proj = proj.view(s, b, -1)
+            r = r.view(s, b, 1)
+
+            h = r * proj
+            h_pre = h[..., :n].sigmoid()
+            h_post = h[..., n : 2 * n].sigmoid() * 2
+            h_res_logits = h[..., 2 * n :]
+            h_res = _ref_sinkhorn(h_res_logits.view(s, b, n, n), sinkhorn_iters, eps)
+
+            aggregated = _ref_h_aggregate(hs.view(s, b, n, C), h_pre)
+
+            output = _ref_h_post_bda(
+                h_res, hs.view(s, b, n, C), h_post, layer_out_data, layer_bias_data
+            )
+
+            loss = output.sum() + aggregated.sum()
+            loss.backward()
+            return output.detach(), aggregated.detach(), hs.grad.clone()
+
+        out_m, agg_m, grad_m = _run_native_modules()
+        out_r, agg_r, grad_r = _run_inline_ref()
+
+        torch.testing.assert_close(
+            agg_m, agg_r, atol=FWD_ATOL, rtol=FWD_RTOL, msg="aggregated output mismatch"
+        )
+        torch.testing.assert_close(
+            out_m, out_r, atol=FWD_ATOL, rtol=FWD_RTOL, msg="h_post_bda output mismatch"
+        )
+        _assert_cosine_similar(
+            grad_m, grad_r, COSINE_SIM_THRESH, msg="hidden_states grad (E2E backward)"
+        )
+
+
+class TestEndToEndFused:
+    """Full mHC pipeline using fused cuTile kernels (requires cuTile)."""
+
+    @_require_cutile
+    def test_full_pipeline_fwd_bwd(self):
+        from megatron.core.fusions.fused_mhc_kernels import (
+            fused_h_aggregate,
+            fused_h_post_bda,
+            fused_proj_rms,
+            fused_sinkhorn,
+        )
+
+        _info()
+        s, b, n, C = 2, 4, 4, 1024
+        eps = 1e-6
+        sinkhorn_iters = 5
+
+        hs_data = _rand(s, b, n * C)
+        w_data = _rand(n * n + 2 * n, n * C)
+        layer_out_data = _rand(s, b, C)
+        layer_bias_data = _rand(C)
+
+        def _run_fused():
+            hs = hs_data.clone().requires_grad_(True)
+            w = w_data.clone().requires_grad_(True)
+
+            x_2d = hs.reshape(s * b, n * C)
+            proj, r = fused_proj_rms(x_2d, w, eps)
+            proj = proj.view(s, b, -1)
+            r = r.view(s, b, 1)
+
+            h = r * proj
+            h_pre = h[..., :n].sigmoid()
+            h_post = h[..., n : 2 * n].sigmoid() * 2
+            h_res_logits = h[..., 2 * n :]
+            h_res = fused_sinkhorn(h_res_logits.view(s, b, n, n), sinkhorn_iters, eps)
+
+            aggregated = fused_h_aggregate(hs.view(s, b, n, C), h_pre)
+
+            output = fused_h_post_bda(
+                h_res, hs.view(s, b, n, C), h_post, layer_out_data, layer_bias_data
+            )
+
+            loss = output.sum() + aggregated.sum()
+            loss.backward()
+            return output.detach(), aggregated.detach(), hs.grad.clone()
+
+        def _run_ref():
+            hs = hs_data.clone().requires_grad_(True)
+            w = w_data.clone().requires_grad_(True)
+
+            x_2d = hs.reshape(s * b, n * C)
+            proj, r = _ref_proj_rms(x_2d, w, eps)
+            proj = proj.view(s, b, -1)
+            r = r.view(s, b, 1)
+
+            h = r * proj
+            h_pre = h[..., :n].sigmoid()
+            h_post = h[..., n : 2 * n].sigmoid() * 2
+            h_res_logits = h[..., 2 * n :]
+            h_res = _ref_sinkhorn(h_res_logits.view(s, b, n, n), sinkhorn_iters, eps)
+
+            aggregated = _ref_h_aggregate(hs.view(s, b, n, C), h_pre)
+
+            output = _ref_h_post_bda(
+                h_res, hs.view(s, b, n, C), h_post, layer_out_data, layer_bias_data
+            )
+
+            loss = output.sum() + aggregated.sum()
+            loss.backward()
+            return output.detach(), aggregated.detach(), hs.grad.clone()
+
+        out_f, agg_f, grad_f = _run_fused()
+        out_r, agg_r, grad_r = _run_ref()
+
+        torch.testing.assert_close(
+            agg_f, agg_r, atol=FWD_ATOL, rtol=FWD_RTOL, msg="aggregated output mismatch"
+        )
+        torch.testing.assert_close(
+            out_f, out_r, atol=FWD_ATOL, rtol=FWD_RTOL, msg="h_post_bda output mismatch"
+        )
+        _assert_cosine_similar(
+            grad_f, grad_r, COSINE_SIM_THRESH, msg="hidden_states grad (E2E backward)"
+        )
diff --git a/tests/unit_tests/fusions/test_mla_yarn_rope_apply.py b/tests/unit_tests/fusions/test_mla_yarn_rope_apply.py
index 1c8976bfcb6..762195b5d7f 100644
--- a/tests/unit_tests/fusions/test_mla_yarn_rope_apply.py
+++ b/tests/unit_tests/fusions/test_mla_yarn_rope_apply.py
@@ -1,21 +1,27 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 
+import warnings
+from unittest.mock import MagicMock, patch
+
 import pytest
 import torch
 
 from megatron.core.models.common.embeddings import apply_rotary_pos_emb
+from megatron.core.models.common.embeddings import rope_utils as rope_utils_module
 from megatron.core.models.common.embeddings.yarn_rotary_pos_embedding import YarnRotaryEmbedding
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import is_torch_min_version
+from tests.unit_tests.test_utilities import Utils
 
 try:
     from megatron.core.fusions.fused_mla_yarn_rope_apply import (
-        fused_apply_mla_rope_for_kv,
-        fused_apply_mla_rope_for_q,
+        fused_mla_rope_inplace,
+        fused_mla_rope_kv_split,
     )
-except:
-    fused_apply_mla_rope_for_kv = None
-    fused_apply_mla_rope_for_q = None
+except Exception:
+    fused_mla_rope_inplace = None
+    fused_mla_rope_kv_split = None
 
 
 def dtype_tols(dtype):
@@ -37,8 +43,8 @@ def rank(self):
         return 0
 
 
-def _test_fused_apply_mla_rope_for_q(input_format):
-    assert fused_apply_mla_rope_for_q is not None
+def _test_fused_mla_rope_inplace(input_format, inverse=False, remove_interleaving=False):
+    assert fused_mla_rope_inplace is not None
     num_heads = 32
     q_dim = 128
     emb_dim = 64
@@ -91,13 +97,28 @@ def _test_fused_apply_mla_rope_for_q(input_format):
 
     no_pe, pe = torch.split(pytorch_fwd_input, [q_dim, emb_dim], dim=-1)
     pe_output = apply_rotary_pos_emb(
-        pe, freqs, transformer_config, cu_seqlens=cu_seqlens, mscale=mscale, cp_group=FakeCPGroup()
+        pe,
+        freqs,
+        transformer_config,
+        cu_seqlens=cu_seqlens,
+        mscale=mscale,
+        cp_group=FakeCPGroup(),
+        mla_rotary_interleaved=True,
+        inverse=inverse,
+        mla_output_remove_interleaving=remove_interleaving,
     )
     pytorch_output = torch.concat([no_pe, pe_output], dim=-1)
     pytorch_output.backward(pytorch_bwd_input, retain_graph=True)
 
-    fused_output = fused_apply_mla_rope_for_q(
-        fused_fwd_input, cos, sin, q_dim, emb_dim, cu_seqlens_q=cu_seqlens
+    fused_output = fused_mla_rope_inplace(
+        fused_fwd_input,
+        cos,
+        sin,
+        q_dim,
+        emb_dim,
+        cu_seqlens_q=cu_seqlens,
+        inverse=inverse,
+        remove_interleaving=remove_interleaving,
     )
     fused_output.backward(fused_bwd_input, retain_graph=True)
 
@@ -116,8 +137,8 @@ def _test_fused_apply_mla_rope_for_q(input_format):
     )
 
 
-def _test_fused_apply_mla_rope_for_kv(input_format):
-    assert fused_apply_mla_rope_for_kv is not None
+def _test_fused_mla_rope_kv_split(input_format, remove_interleaving=False):
+    assert fused_mla_rope_kv_split is not None
     num_heads = 32
     k_dim = 128
     v_dim = 128
@@ -190,6 +211,8 @@ def _test_fused_apply_mla_rope_for_kv(input_format):
         cu_seqlens=cu_seqlens,
         mscale=mscale,
         cp_group=FakeCPGroup(),
+        mla_rotary_interleaved=True,
+        mla_output_remove_interleaving=remove_interleaving,
     )
     if input_format == "sbhd":
         pe_output = pe_output.expand(-1, -1, num_heads, -1)
@@ -201,7 +224,7 @@ def _test_fused_apply_mla_rope_for_kv(input_format):
         (pytorch_k_output, pytorch_v_output), (pytorch_bwd_k_input, pytorch_bwd_v_input)
     )
 
-    fused_k_output, fused_v_output = fused_apply_mla_rope_for_kv(
+    fused_k_output, fused_v_output = fused_mla_rope_kv_split(
         fused_fwd_kv_input,
         fused_fwd_emb_input,
         cos,
@@ -210,6 +233,7 @@ def _test_fused_apply_mla_rope_for_kv(input_format):
         k_dim,
         v_dim,
         cu_seqlens_kv=cu_seqlens,
+        remove_interleaving=remove_interleaving,
     )
     torch.autograd.backward(
         (fused_k_output, fused_v_output), (fused_bwd_k_input, fused_bwd_v_input)
@@ -247,9 +271,70 @@ def _test_fused_apply_mla_rope_for_kv(input_format):
 @pytest.mark.skipif(not is_torch_min_version("2.5.0"), reason="Requires PyTorch >= 2.5.0")
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize("input_format", ["sbhd", "thd"])
-class TestFusedApplyMLARope:
-    def test_forward_backward_for_q(self, input_format):
-        _test_fused_apply_mla_rope_for_q(input_format)
+class TestFusedMLARope:
+    @pytest.mark.parametrize("inverse", [False, True])
+    @pytest.mark.parametrize("remove_interleaving", [False, True])
+    def test_inplace_forward_backward(self, input_format, inverse, remove_interleaving):
+        _test_fused_mla_rope_inplace(
+            input_format, inverse=inverse, remove_interleaving=remove_interleaving
+        )
+
+    @pytest.mark.parametrize("remove_interleaving", [False, True])
+    def test_kv_split_forward_backward(self, input_format, remove_interleaving):
+        _test_fused_mla_rope_kv_split(input_format, remove_interleaving=remove_interleaving)
+
+
+class TestApplyRotaryPosEmbMlaFusionConflict:
+    """Test apply_rotary_pos_emb: mla_rotary_interleaved vs apply_rope_fusion conflict."""
+
+    def setup_method(self):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+        self.seq_len = 16
+        self.num_heads = 2
+        self.kv_channels = 32
+        self.rot_dim = self.kv_channels
+
+    def teardown_method(self):
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_mla_rotary_interleaved_with_apply_rope_fusion_emits_warning_and_uses_unfused(self):
+        """When apply_rope_fusion=True and mla_rotary_interleaved=True, expect warning and unfused path."""
+        config = TransformerConfig(
+            num_attention_heads=self.num_heads,
+            num_layers=1,
+            apply_rope_fusion=True,
+            rotary_interleaved=False,
+        )
+        t = torch.randn(
+            self.seq_len, 1, self.num_heads, self.kv_channels, device="cuda", dtype=torch.float32
+        )
+        freqs = torch.randn(self.seq_len, 1, 1, self.rot_dim, device="cuda", dtype=torch.float32)
 
-    def test_forward_backward_for_kv(self, input_format):
-        _test_fused_apply_mla_rope_for_kv(input_format)
+        fused_mock = MagicMock(return_value=t.clone())
+        with (
+            patch.object(rope_utils_module, "fused_apply_rotary_pos_emb", fused_mock),
+            patch.object(
+                rope_utils_module,
+                "_apply_rotary_pos_emb_bshd",
+                wraps=rope_utils_module._apply_rotary_pos_emb_bshd,
+            ) as unfused_spy,
+        ):
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                out = apply_rotary_pos_emb(t, freqs, config, mla_rotary_interleaved=True)
+            # Should have warned about MLA + fusion conflict
+            mla_fusion_warnings = [
+                x for x in w if "apply_rope_fusion does not support MLA-style" in str(x.message)
+            ]
+            assert (
+                len(mla_fusion_warnings) >= 1
+            ), "Expected warning when mla_rotary_interleaved and apply_rope_fusion both enabled"
+            # Fused kernel must not be used
+            fused_mock.assert_not_called()
+            # Unfused path must have been used
+            unfused_spy.assert_called_once()
+            call_kw = unfused_spy.call_args[1]
+            assert call_kw["mla_rotary_interleaved"] is True
+        assert out.shape == t.shape
diff --git a/tests/unit_tests/fusions/test_swiglu_fusion.py b/tests/unit_tests/fusions/test_swiglu_fusion.py
index c72679cd047..58e7069d3f1 100644
--- a/tests/unit_tests/fusions/test_swiglu_fusion.py
+++ b/tests/unit_tests/fusions/test_swiglu_fusion.py
@@ -1,5 +1,8 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import pytest
 import torch
+import torch.nn.functional as F
 
 from megatron.core.fusions.fused_bias_swiglu import bias_swiglu_impl, weighted_bias_swiglu_impl
 
@@ -39,3 +42,46 @@ def test_weighted_bias_swiglu(input_dtype):
     assert weights_2.grad.dtype == weights.grad.dtype
     if input_dtype == torch.float32:
         assert torch.allclose(weights.grad, weights_2.grad, **tols)
+
+
+@pytest.mark.parametrize("input_dtype", [torch.bfloat16, torch.float32])
+def test_clamped_weighted_bias_swiglu(input_dtype):
+    clamp_value = 10.0
+
+    if input_dtype == torch.float32:
+        tols = dict(rtol=1.0e-6, atol=1.0e-6)
+    elif input_dtype == torch.bfloat16:
+        tols = dict(rtol=2.0e-2, atol=1.0e-3)
+    else:
+        raise ValueError(f"Invalid input dtype: {input_dtype}")
+
+    x = torch.randn(16, 64, dtype=input_dtype, device="cuda")
+    x.requires_grad = True
+    weights = torch.randn(16, 1, dtype=torch.float32, device="cuda")
+    weights.requires_grad = True
+    bwd_input = torch.randn(16, 32, dtype=input_dtype, device="cuda")
+
+    # Reference: manual clamp + silu + weight
+    y_1, y_2 = torch.chunk(x, 2, -1)
+    y_1c = y_1.clamp(min=None, max=clamp_value)
+    y_2c = y_2.clamp(min=-clamp_value, max=clamp_value)
+    y = (F.silu(y_1c) * y_2c * weights).to(input_dtype)
+    y.backward(bwd_input)
+
+    x_2 = x.detach().clone()
+    x_2.requires_grad = True
+    weights_2 = weights.detach().clone()
+    weights_2.requires_grad = True
+    bwd_input_2 = bwd_input.detach().clone()
+
+    # Fused implementation
+    y_2_out = weighted_bias_swiglu_impl(x_2, None, weights_2, clamp_value=clamp_value)
+    y_2_out.backward(bwd_input_2)
+
+    assert y_2_out.dtype == y.dtype
+    assert torch.allclose(y, y_2_out, **tols)
+    assert x_2.grad.dtype == x.grad.dtype
+    assert torch.allclose(x.grad, x_2.grad, **tols)
+    assert weights_2.grad.dtype == weights.grad.dtype
+    if input_dtype == torch.float32:
+        assert torch.allclose(weights.grad, weights_2.grad, **tols)
diff --git a/tests/unit_tests/fusions/test_weighted_squared_relu_fusion.py b/tests/unit_tests/fusions/test_weighted_squared_relu_fusion.py
index 85755ac1de7..58907a39b7f 100644
--- a/tests/unit_tests/fusions/test_weighted_squared_relu_fusion.py
+++ b/tests/unit_tests/fusions/test_weighted_squared_relu_fusion.py
@@ -13,7 +13,7 @@
 def test_weighted_squared_relu_fusion(input_dtype):
     # Tolerances depend on dtype precision
     if input_dtype == torch.float32:
-        tols = dict(rtol=1.0e-6, atol=1.0e-6)
+        tols = dict(rtol=1.0e-5, atol=1.0e-5)
     elif input_dtype == torch.bfloat16:
         tols = dict(rtol=2.0e-2, atol=1.0e-3)
     else:
diff --git a/tests/unit_tests/models/test_experimental_attention_variant_module_specs.py b/tests/unit_tests/models/test_experimental_attention_variant_module_specs.py
new file mode 100644
index 00000000000..e3a589f1b97
--- /dev/null
+++ b/tests/unit_tests/models/test_experimental_attention_variant_module_specs.py
@@ -0,0 +1,660 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from megatron.core.transformer.enums import AttnMaskType, LayerType
+from megatron.core.transformer.identity_op import IdentityOp
+from megatron.core.transformer.spec_utils import ModuleSpec
+from megatron.core.transformer.transformer_block import TransformerBlockSubmodules
+from megatron.core.transformer.transformer_layer import (
+    HyperConnectionTransformerLayer,
+    TransformerLayer,
+)
+
+# ---------------------------------------------------------------------------
+# Helpers: fake backend and config builders
+# ---------------------------------------------------------------------------
+
+
+class _FakeLinear:
+    pass
+
+
+class _FakeColumnParallelLinear:
+    pass
+
+
+class _FakeRowParallelLinear:
+    pass
+
+
+class _FakeLayerNormColumnParallelLinear:
+    pass
+
+
+class _FakeLayerNorm:
+    pass
+
+
+class _FakeQKNorm:
+    pass
+
+
+class _FakeCoreAttention:
+    pass
+
+
+def _make_backend(fuse_layernorm=True):
+    """Return a mock BackendSpecProvider with deterministic return values."""
+    backend = MagicMock()
+    backend.linear.return_value = _FakeLinear
+    backend.column_parallel_linear.return_value = _FakeColumnParallelLinear
+    backend.row_parallel_linear.return_value = _FakeRowParallelLinear
+    backend.column_parallel_layer_norm_linear.return_value = _FakeLayerNormColumnParallelLinear
+    backend.fuse_layernorm_and_linear.return_value = fuse_layernorm
+    backend.core_attention.return_value = _FakeCoreAttention
+
+    def _layer_norm(rms_norm=False, for_qk=False):
+        return _FakeQKNorm if for_qk else _FakeLayerNorm
+
+    backend.layer_norm.side_effect = _layer_norm
+    return backend
+
+
+def _make_config(**overrides):
+    """Return a mock TransformerConfig with sane defaults."""
+    defaults = dict(
+        num_layers=4,
+        normalization="RMSNorm",
+        qk_layernorm=False,
+        multi_latent_attention=False,
+        qk_l2_norm=False,
+        transformer_impl="transformer_engine",
+        use_kitchen=False,
+        experimental_attention_variant=None,
+        linear_attention_freq=None,
+        moe_layer_freq=1,
+        num_moe_experts=None,
+        moe_grouped_gemm=False,
+        moe_use_legacy_grouped_gemm=False,
+        use_te_activation_func=False,
+        pipeline_model_parallel_size=1,
+        pipeline_model_parallel_layout=None,
+        use_kitchen_attention=False,
+        kitchen_attention_backend="sdpa",
+        fallback_to_eager_attn=False,
+        enable_hyper_connections=False,
+    )
+    defaults.update(overrides)
+    cfg = MagicMock()
+    for k, v in defaults.items():
+        setattr(cfg, k, v)
+    return cfg
+
+
+# ===================================================================
+# Tests for is_linear_attention_variant
+# ===================================================================
+
+
+class TestIsLinearAttentionVariant:
+    @staticmethod
+    def _fn(variant):
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            is_linear_attention_variant,
+        )
+
+        return is_linear_attention_variant(variant)
+
+    @pytest.mark.parametrize(
+        "variant, expected",
+        [("gated_delta_net", True), ("dsa", False), (None, False), ("some_unknown_variant", False)],
+    )
+    def test_variants(self, variant, expected):
+        """Validate linear-attention variant classification across supported and unsupported names."""
+        assert self._fn(variant) is expected
+
+
+# ===================================================================
+# Tests for get_moe_layer_pattern
+# ===================================================================
+
+
+class TestGetMoeLayerPattern:
+    @staticmethod
+    def _fn(config):
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_moe_layer_pattern,
+        )
+
+        return get_moe_layer_pattern(config)
+
+    @pytest.mark.parametrize(
+        "num_layers, freq, expected",
+        [(4, 1, [1, 1, 1, 1]), (6, 2, [1, 0, 1, 0, 1, 0]), (6, 3, [1, 0, 0, 1, 0, 0])],
+    )
+    def test_int_freq(self, num_layers, freq, expected):
+        """Verify integer moe_layer_freq is expanded into the expected per-layer MoE pattern."""
+        cfg = _make_config(num_layers=num_layers, moe_layer_freq=freq)
+        assert self._fn(cfg) == expected
+
+    def test_list_freq(self):
+        """Verify an explicit list pattern is used as-is."""
+        pattern = [1, 0, 1, 0]
+        cfg = _make_config(num_layers=4, moe_layer_freq=pattern)
+        assert self._fn(cfg) == pattern
+
+    def test_list_freq_wrong_length_raises(self):
+        """Verify a list with mismatched length fails fast."""
+        cfg = _make_config(num_layers=4, moe_layer_freq=[1, 0])
+        with pytest.raises(AssertionError, match="Invalid length"):
+            self._fn(cfg)
+
+    def test_invalid_type_raises(self):
+        """Verify unsupported moe_layer_freq types raise ValueError."""
+        cfg = _make_config(num_layers=4, moe_layer_freq="bad")
+        with pytest.raises(ValueError, match="Invalid moe_layer_freq"):
+            self._fn(cfg)
+
+
+# ===================================================================
+# Tests for get_linear_attention_pattern
+# ===================================================================
+
+
+class TestGetLinearAttentionPattern:
+    @staticmethod
+    def _fn(config):
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_linear_attention_pattern,
+        )
+
+        return get_linear_attention_pattern(config)
+
+    @pytest.mark.parametrize(
+        "num_layers, freq, expected",
+        [
+            # Every 4th layer (1-indexed) is SDPA (0), the rest are LA (1)
+            (8, 4, [1, 1, 1, 0, 1, 1, 1, 0]),
+            (4, 2, [1, 0, 1, 0]),
+            (3, 1, [0, 0, 0]),
+        ],
+    )
+    def test_int_freq(self, num_layers, freq, expected):
+        """Verify integer linear_attention_freq is expanded into the expected LA/SDPA pattern."""
+        cfg = _make_config(num_layers=num_layers, linear_attention_freq=freq)
+        assert self._fn(cfg) == expected
+
+    def test_list_freq(self):
+        """Verify an explicit linear-attention pattern list is used directly."""
+        pattern = [1, 0, 1, 0]
+        cfg = _make_config(num_layers=4, linear_attention_freq=pattern)
+        assert self._fn(cfg) == pattern
+
+    def test_list_freq_wrong_length_raises(self):
+        """Verify list length validation for linear_attention_freq."""
+        cfg = _make_config(num_layers=4, linear_attention_freq=[1, 0, 1])
+        with pytest.raises(AssertionError, match="Invalid length"):
+            self._fn(cfg)
+
+    def test_none_for_non_linear_variant(self):
+        """Verify non-linear variants default to all-standard attention when freq is None."""
+        cfg = _make_config(
+            num_layers=4, linear_attention_freq=None, experimental_attention_variant="dsa"
+        )
+        assert self._fn(cfg) == [0, 0, 0, 0]
+
+    def test_none_for_linear_variant_raises(self):
+        """Verify linear variants require linear_attention_freq to be explicitly set."""
+        cfg = _make_config(
+            num_layers=4,
+            linear_attention_freq=None,
+            experimental_attention_variant="gated_delta_net",
+        )
+        with pytest.raises(ValueError, match="linear_attention_freq is None"):
+            self._fn(cfg)
+
+    def test_invalid_type_raises(self):
+        """Verify unsupported linear_attention_freq types raise ValueError."""
+        cfg = _make_config(num_layers=4, linear_attention_freq=3.14)
+        with pytest.raises(ValueError, match="Invalid linear_attention_freq"):
+            self._fn(cfg)
+
+
+# ===================================================================
+# Tests for get_gated_delta_net_module_spec
+# ===================================================================
+
+
+class TestGetGatedDeltaNetModuleSpec:
+    def test_returns_correct_module_spec(self):
+        """Verify the top-level module spec targets GatedDeltaNet with expected metainfo."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_gated_delta_net_module_spec,
+        )
+        from megatron.core.ssm.gated_delta_net import GatedDeltaNet
+
+        backend = _make_backend()
+        cfg = _make_config(normalization="RMSNorm")
+        spec = get_gated_delta_net_module_spec(cfg, backend=backend)
+
+        assert isinstance(spec, ModuleSpec)
+        assert spec.module is GatedDeltaNet
+        assert spec.metainfo == {"fuse_input_layernorm": True}
+
+    def test_submodules_use_backend_modules(self):
+        """Verify backend-provided projection/norm modules are wired into submodules."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_gated_delta_net_module_spec,
+        )
+
+        backend = _make_backend()
+        cfg = _make_config(normalization="RMSNorm")
+        spec = get_gated_delta_net_module_spec(cfg, backend=backend)
+
+        subs = spec.submodules
+        assert subs.in_proj == _FakeLayerNormColumnParallelLinear
+        assert subs.out_proj == _FakeRowParallelLinear
+        backend.layer_norm.assert_any_call(rms_norm=True, for_qk=False)
+
+    def test_layer_norm_normalization(self):
+        """Verify LayerNorm mode passes rms_norm=False to backend.layer_norm."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_gated_delta_net_module_spec,
+        )
+
+        backend = _make_backend()
+        cfg = _make_config(normalization="LayerNorm")
+        get_gated_delta_net_module_spec(cfg, backend=backend)
+        backend.layer_norm.assert_any_call(rms_norm=False, for_qk=False)
+
+    def test_backend_auto_resolved_when_none(self):
+        """Verify backend is auto-resolved when caller does not pass one."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_gated_delta_net_module_spec,
+        )
+
+        cfg = _make_config(normalization="RMSNorm")
+        with patch(
+            "megatron.core.models.gpt.experimental_attention_variant_module_specs"
+            "._get_backend_spec_provider",
+            return_value=_make_backend(),
+        ):
+            spec = get_gated_delta_net_module_spec(cfg, backend=None)
+            assert isinstance(spec, ModuleSpec)
+
+
+# ===================================================================
+# Tests for get_dsa_module_spec_for_backend
+# ===================================================================
+
+
+class TestGetDsaModuleSpec:
+    def _call(self, cfg=None, backend=None):
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_dsa_module_spec_for_backend,
+        )
+
+        if cfg is None:
+            cfg = _make_config(multi_latent_attention=True, qk_l2_norm=False, qk_layernorm=True)
+        if backend is None:
+            backend = _make_backend()
+        return get_dsa_module_spec_for_backend(cfg, backend=backend)
+
+    def test_requires_multi_latent_attention(self):
+        """Verify DSA path rejects configs without MLA enabled."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_dsa_module_spec_for_backend,
+        )
+
+        cfg = _make_config(multi_latent_attention=False, qk_l2_norm=False)
+        with pytest.raises(AssertionError, match="only MLA supports"):
+            get_dsa_module_spec_for_backend(cfg, backend=_make_backend())
+
+    def test_rejects_qk_l2_norm(self):
+        """Verify unsupported qk_l2_norm setting is rejected for DSA+MLA."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_dsa_module_spec_for_backend,
+        )
+
+        cfg = _make_config(multi_latent_attention=True, qk_l2_norm=True)
+        with pytest.raises(AssertionError, match="qk_l2_norm is not supported"):
+            get_dsa_module_spec_for_backend(cfg, backend=_make_backend())
+
+    def test_returns_mla_self_attention_spec(self):
+        """Verify the returned attention module is MLA self-attention with causal mask."""
+        from megatron.core.transformer.multi_latent_attention import MLASelfAttention
+
+        spec = self._call()
+        assert spec.module is MLASelfAttention
+        assert spec.params == {"attn_mask_type": AttnMaskType.causal}
+        assert spec.metainfo == {"fuse_input_layernorm": False}
+
+    def test_core_attention_is_dsa(self):
+        """Verify MLA core_attention is wrapped with DSAttention."""
+        from megatron.core.transformer.experimental_attention_variant.dsa import DSAttention
+
+        spec = self._call()
+        core = spec.submodules.core_attention
+        assert core.module is DSAttention
+
+    def test_dsa_indexer_structure(self):
+        """Verify DSA indexer wiring uses expected backend linear/norm modules."""
+        from megatron.core.transformer.experimental_attention_variant.dsa import DSAIndexer
+
+        spec = self._call()
+        indexer = spec.submodules.core_attention.submodules.indexer
+        assert indexer.module is DSAIndexer
+        subs = indexer.submodules
+        assert subs.linear_wq_b == _FakeLinear
+        assert subs.linear_wk == _FakeLinear
+        assert subs.k_norm == _FakeQKNorm
+        assert subs.linear_weights_proj == _FakeLinear
+
+    @pytest.mark.parametrize("normalization", ["RMSNorm", "LayerNorm"])
+    def test_qk_layernorm_enabled(self, normalization):
+        """Verify q/kv layernorm uses backend.layer_norm(rms_norm=..., for_qk=True)."""
+        backend = _make_backend()
+        cfg = _make_config(
+            multi_latent_attention=True,
+            qk_l2_norm=False,
+            qk_layernorm=True,
+            normalization=normalization,
+        )
+        spec = self._call(cfg=cfg, backend=backend)
+        expected_rms = normalization == "RMSNorm"
+        assert spec.submodules.q_layernorm == _FakeQKNorm
+        assert spec.submodules.kv_layernorm == _FakeQKNorm
+        # Both point to the same qk_norm object
+        assert spec.submodules.q_layernorm is spec.submodules.kv_layernorm
+        backend.layer_norm.assert_any_call(rms_norm=expected_rms, for_qk=True)
+
+    def test_qk_layernorm_disabled(self):
+        """Verify q/kv layernorm becomes IdentityOp, skipping backend.layer_norm for qk."""
+        backend = _make_backend()
+        cfg = _make_config(multi_latent_attention=True, qk_l2_norm=False, qk_layernorm=False)
+        spec = self._call(cfg=cfg, backend=backend)
+        assert spec.submodules.q_layernorm is IdentityOp
+        assert spec.submodules.kv_layernorm is IdentityOp
+        # backend.layer_norm is still called for the indexer k_norm (for_qk=True at line 94),
+        # but NOT for the outer qk_norm (line 105-107 takes the else branch).
+        # Exactly one for_qk=True call should exist (from the indexer, not from qk_norm).
+        qk_calls = [c for c in backend.layer_norm.call_args_list if c.kwargs.get("for_qk")]
+        assert (
+            len(qk_calls) == 1
+        ), f"Expected 1 for_qk=True call (indexer only), got {len(qk_calls)}"
+
+    def test_linear_projections(self):
+        """Verify Q/KV projection slots and backend.column_parallel_linear call count."""
+        backend = _make_backend()
+        cfg = _make_config(multi_latent_attention=True, qk_l2_norm=False, qk_layernorm=True)
+        spec = self._call(cfg=cfg, backend=backend)
+        subs = spec.submodules
+        assert subs.linear_q_proj == _FakeColumnParallelLinear
+        assert subs.linear_q_down_proj == _FakeLinear
+        assert subs.linear_q_up_proj == _FakeColumnParallelLinear
+        assert subs.linear_kv_down_proj == _FakeLinear
+        assert subs.linear_kv_up_proj == _FakeColumnParallelLinear
+        assert subs.linear_proj == _FakeRowParallelLinear
+        # column_parallel_linear() is called exactly 3 times (q_proj, q_up_proj, kv_up_proj)
+        assert backend.column_parallel_linear.call_count == 3
+        assert backend.row_parallel_linear.call_count == 1
+
+
+# ===================================================================
+# Tests for get_experimental_attention_variant_module_spec
+# ===================================================================
+
+
+class TestGetExperimentalAttentionVariantModuleSpec:
+    MODULE = "megatron.core.models.gpt.experimental_attention_variant_module_specs"
+
+    @pytest.mark.parametrize(
+        "variant, target_fn",
+        [
+            ("gated_delta_net", "get_gated_delta_net_module_spec"),
+            ("dsa", "get_dsa_module_spec_for_backend"),
+        ],
+    )
+    def test_dispatches_to_variant_handler(self, variant, target_fn):
+        """Verify dispatcher routes each variant name to its corresponding builder function."""
+        backend = _make_backend()
+        cfg = _make_config(experimental_attention_variant=variant, normalization="RMSNorm")
+        with patch(f"{self.MODULE}.{target_fn}") as mock_fn:
+            mock_fn.return_value = ModuleSpec(module=MagicMock)
+            from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+                get_experimental_attention_variant_module_spec,
+            )
+
+            result = get_experimental_attention_variant_module_spec(cfg, backend=backend)
+            mock_fn.assert_called_once_with(config=cfg, backend=backend)
+            assert result is mock_fn.return_value
+
+    def test_invalid_variant_raises(self):
+        """Verify unknown variant names raise a clear ValueError."""
+        cfg = _make_config(experimental_attention_variant="unknown")
+        with pytest.raises(ValueError, match="Invalid experimental attention variant"):
+            from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+                get_experimental_attention_variant_module_spec,
+            )
+
+            get_experimental_attention_variant_module_spec(cfg, backend=_make_backend())
+
+
+# ===================================================================
+# Tests for get_transformer_layer_with_experimental_attention_variant_spec
+# ===================================================================
+
+
+class TestGetTransformerLayerWithExperimentalAttentionVariantSpec:
+    MODULE = "megatron.core.models.gpt.experimental_attention_variant_module_specs"
+
+    def _make_attention_spec(self, fuse_input_layernorm=True):
+        """Construct a mock attention spec with configurable fuse metadata."""
+        return ModuleSpec(module=MagicMock, metainfo={"fuse_input_layernorm": fuse_input_layernorm})
+
+    def _make_mlp_spec(self, fuse_pre_mlp_layernorm=True):
+        """Construct a mock MLP spec with configurable fuse metadata."""
+        return ModuleSpec(
+            module=MagicMock, metainfo={"fuse_pre_mlp_layernorm": fuse_pre_mlp_layernorm}
+        )
+
+    def test_all_experimental_no_moe(self):
+        """Verify all layers use experimental attention and dense MLP when no MoE is configured."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_transformer_layer_with_experimental_attention_variant_spec,
+        )
+
+        cfg = _make_config(
+            num_layers=4,
+            experimental_attention_variant="dsa",
+            num_moe_experts=None,
+            normalization="RMSNorm",
+        )
+        backend = _make_backend()
+        attn_spec = self._make_attention_spec(fuse_input_layernorm=False)
+        mlp_spec = self._make_mlp_spec(fuse_pre_mlp_layernorm=True)
+
+        with (
+            patch(
+                f"{self.MODULE}.get_experimental_attention_variant_module_spec",
+                return_value=attn_spec,
+            ),
+            patch(f"{self.MODULE}._get_dense_mlp_module_spec", return_value=mlp_spec),
+        ):
+            specs = get_transformer_layer_with_experimental_attention_variant_spec(
+                cfg, backend=backend
+            )
+
+        assert len(specs) == 4
+        for s in specs:
+            # Each layer should share the same selected module specs in this setup.
+            assert s.module is TransformerLayer
+            assert s.submodules.self_attention is attn_spec
+            assert s.submodules.mlp is mlp_spec
+
+    def test_hybrid_attention_pattern(self):
+        """Verify attention alternates between experimental and standard specs per pattern."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_transformer_layer_with_experimental_attention_variant_spec,
+        )
+
+        cfg = _make_config(
+            num_layers=4,
+            experimental_attention_variant="gated_delta_net",
+            linear_attention_freq=2,
+            num_moe_experts=None,
+            normalization="RMSNorm",
+        )
+        backend = _make_backend()
+        exp_attn_spec = self._make_attention_spec(fuse_input_layernorm=True)
+        std_attn_spec = self._make_attention_spec(fuse_input_layernorm=False)
+        mlp_spec = self._make_mlp_spec(fuse_pre_mlp_layernorm=True)
+
+        with (
+            patch(
+                f"{self.MODULE}.get_experimental_attention_variant_module_spec",
+                return_value=exp_attn_spec,
+            ),
+            patch(f"{self.MODULE}._get_self_attention_module_spec", return_value=std_attn_spec),
+            patch(f"{self.MODULE}._get_dense_mlp_module_spec", return_value=mlp_spec),
+        ):
+            specs = get_transformer_layer_with_experimental_attention_variant_spec(
+                cfg, backend=backend
+            )
+
+        assert len(specs) == 4
+        # Pattern for linear_attention_freq=2: [1, 0, 1, 0]
+        assert specs[0].submodules.self_attention is exp_attn_spec
+        assert specs[1].submodules.self_attention is std_attn_spec
+        assert specs[2].submodules.self_attention is exp_attn_spec
+        assert specs[3].submodules.self_attention is std_attn_spec
+
+    def test_hybrid_moe_pattern_with_mhc(self):
+        """Verify MLP alternates between MoE and dense specs per moe_layer_freq pattern."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_transformer_layer_with_experimental_attention_variant_spec,
+        )
+
+        cfg = _make_config(
+            num_layers=4,
+            experimental_attention_variant="dsa",
+            num_moe_experts=8,
+            moe_layer_freq=2,
+            normalization="RMSNorm",
+            enable_hyper_connections=True,
+        )
+        backend = _make_backend()
+        attn_spec = self._make_attention_spec(fuse_input_layernorm=False)
+        moe_spec = self._make_mlp_spec(fuse_pre_mlp_layernorm=False)
+        dense_spec = self._make_mlp_spec(fuse_pre_mlp_layernorm=True)
+
+        with (
+            patch(
+                f"{self.MODULE}.get_experimental_attention_variant_module_spec",
+                return_value=attn_spec,
+            ),
+            patch(f"{self.MODULE}._get_moe_module_spec", return_value=moe_spec),
+            patch(f"{self.MODULE}._get_dense_mlp_module_spec", return_value=dense_spec),
+        ):
+            specs = get_transformer_layer_with_experimental_attention_variant_spec(
+                cfg, backend=backend
+            )
+
+        # moe_layer_freq=2 -> [1, 0, 1, 0]
+        assert specs[0].submodules.mlp is moe_spec
+        assert specs[1].submodules.mlp is dense_spec
+        assert specs[2].submodules.mlp is moe_spec
+        assert specs[3].submodules.mlp is dense_spec
+        for s in specs:
+            assert s.module is HyperConnectionTransformerLayer
+
+
+# ===================================================================
+# Tests for get_transformer_block_with_experimental_attention_variant_spec
+# ===================================================================
+
+
+class TestGetTransformerBlockWithExperimentalAttentionVariantSpec:
+    MODULE = "megatron.core.models.gpt.experimental_attention_variant_module_specs"
+
+    @pytest.mark.parametrize(
+        "num_layers,pp_size,vp_stage,pp_rank,use_layout,offset,num_layers_to_build,layout_ids,expected_ids",
+        [
+            # no pipeline split
+            (4, 1, None, None, False, 0, 4, None, [0, 1, 2, 3]),
+            # pp split (rank 1 gets [4,5,6,7])
+            (8, 2, None, 1, False, 4, 4, None, [4, 5, 6, 7]),
+            # vpp + pp split (example stage)
+            (8, 2, 1, 0, False, 2, 2, None, [2, 3]),
+            # explicit pipeline layout wins over offset/num_layers
+            (8, 2, 0, 0, True, None, None, [0, 3, 5], [0, 3, 5]),
+        ],
+    )
+    def test_get_transformer_block_with_experimental_attention_variant_spec(
+        self,
+        num_layers,
+        pp_size,
+        vp_stage,
+        pp_rank,
+        use_layout,
+        offset,
+        num_layers_to_build,
+        layout_ids,
+        expected_ids,
+    ):
+        """Verify transformer block layer slicing and vp/pp argument forwarding."""
+        from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+            get_transformer_block_with_experimental_attention_variant_spec,
+        )
+
+        mock_layout = MagicMock() if use_layout else None
+        if mock_layout is not None:
+            # When layout is provided, it should fully control local layer selection.
+            mock_layout.get_layer_id_list.return_value = layout_ids
+
+        cfg = _make_config(
+            num_layers=num_layers,
+            pipeline_model_parallel_size=pp_size,
+            pipeline_model_parallel_layout=mock_layout,
+            normalization="RMSNorm",
+        )
+        backend = _make_backend()
+        fake_layer_specs = [
+            ModuleSpec(module=TransformerLayer, submodules=MagicMock()) for _ in range(num_layers)
+        ]
+
+        with (
+            patch(f"{self.MODULE}._get_backend_spec_provider", return_value=backend),
+            patch(
+                f"{self.MODULE}.get_transformer_layer_with_experimental_attention_variant_spec",
+                return_value=fake_layer_specs,
+            ),
+        ):
+            if use_layout:
+                result = get_transformer_block_with_experimental_attention_variant_spec(
+                    cfg, vp_stage=vp_stage, pp_rank=pp_rank
+                )
+                mock_layout.get_layer_id_list.assert_called_once_with(
+                    layer_type=LayerType.decoder, vp_stage=vp_stage, pp_rank=pp_rank
+                )
+            else:
+                # Without explicit layout, slicing comes from offset + num_layers_to_build.
+                with (
+                    patch(
+                        f"{self.MODULE}.get_transformer_layer_offset", return_value=offset
+                    ) as mock_offset,
+                    patch(
+                        f"{self.MODULE}.get_num_layers_to_build", return_value=num_layers_to_build
+                    ) as mock_num_layers,
+                ):
+                    result = get_transformer_block_with_experimental_attention_variant_spec(
+                        cfg, vp_stage=vp_stage, pp_rank=pp_rank
+                    )
+                mock_offset.assert_called_once_with(cfg, vp_stage=vp_stage, pp_rank=pp_rank)
+                mock_num_layers.assert_called_once_with(cfg, vp_stage=vp_stage, pp_rank=pp_rank)
+
+        assert isinstance(result, TransformerBlockSubmodules)
+        assert result.layer_specs == [fake_layer_specs[i] for i in expected_ids]
diff --git a/tests/unit_tests/models/test_gpt_layer_specs.py b/tests/unit_tests/models/test_gpt_layer_specs.py
new file mode 100644
index 00000000000..bfa86fd0241
--- /dev/null
+++ b/tests/unit_tests/models/test_gpt_layer_specs.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import pytest
+
+from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_local_spec,
+    get_gpt_layer_with_transformer_engine_spec,
+)
+from megatron.core.transformer.hyper_connection import HyperConnectionModule
+from megatron.core.transformer.identity_op import IdentityOp
+from megatron.core.transformer.transformer_layer import (
+    HyperConnectionTransformerLayer,
+    TransformerLayer,
+)
+
+_TE = get_gpt_layer_with_transformer_engine_spec
+_LOCAL = get_gpt_layer_local_spec
+_HC = HyperConnectionTransformerLayer
+_HC_MOD = HyperConnectionModule
+_TL = TransformerLayer
+_ID = IdentityOp
+
+
+class TestGptLayerSpecsHyperConnection:
+    """Test that enable_hyper_connection controls module types in layer specs."""
+
+    @pytest.mark.parametrize(
+        "factory,kwargs,expected_module,expected_hc",
+        [
+            (_TE, {}, _TL, _ID),
+            (_TE, {"enable_hyper_connection": True}, _HC, _HC_MOD),
+            (_TE, {"enable_hyper_connection": False}, _TL, _ID),
+            (_TE, {"multi_latent_attention": True, "enable_hyper_connection": False}, _TL, _ID),
+            (_TE, {"multi_latent_attention": True, "enable_hyper_connection": True}, _HC, _HC_MOD),
+            (_LOCAL, {}, _TL, _ID),
+            (_LOCAL, {"enable_hyper_connection": True}, _HC, _HC_MOD),
+            (_LOCAL, {"enable_hyper_connection": False}, _TL, _ID),
+            (_LOCAL, {"multi_latent_attention": True, "enable_hyper_connection": False}, _TL, _ID),
+            (
+                _LOCAL,
+                {"multi_latent_attention": True, "enable_hyper_connection": True},
+                _HC,
+                _HC_MOD,
+            ),
+            (_LOCAL, {"normalization": "RMSNorm", "enable_hyper_connection": False}, _TL, _ID),
+            (_LOCAL, {"normalization": "RMSNorm", "enable_hyper_connection": True}, _HC, _HC_MOD),
+        ],
+        ids=[
+            "te_default",
+            "te_enable",
+            "te_disable",
+            "te_mla_disable",
+            "te_mla_enable",
+            "local_default",
+            "local_enable",
+            "local_disable",
+            "local_mla_disable",
+            "local_mla_enable",
+            "local_rmsnorm_disable",
+            "local_rmsnorm_enable",
+        ],
+    )
+    def test_hyper_connection_spec(self, factory, kwargs, expected_module, expected_hc):
+        spec = factory(**kwargs)
+        assert spec.module is expected_module
+        assert spec.submodules.self_attention_hyper_connection is expected_hc
+        assert spec.submodules.mlp_hyper_connection is expected_hc
diff --git a/tests/unit_tests/models/test_hybrid_moe_model.py b/tests/unit_tests/models/test_hybrid_moe_model.py
index 01a46efe083..c1f10b0881a 100644
--- a/tests/unit_tests/models/test_hybrid_moe_model.py
+++ b/tests/unit_tests/models/test_hybrid_moe_model.py
@@ -16,6 +16,7 @@
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer import TransformerConfig
 from megatron.core.transformer.enums import AttnBackend
+from megatron.core.transformer.moe.moe_logging import destroy_moe_metrics_tracker
 from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args
 from megatron.training.global_vars import (
     destroy_global_vars,
@@ -32,6 +33,7 @@
     "activation_func": "megatron.core.activations.squared_relu",
     "activation_func_clamp_value": None,
     "activation_func_fp8_input_store": False,
+    "actual_vocab_size": 131072,
     "add_bias_linear": False,
     "add_qkv_bias": False,
     "apply_query_key_layer_scaling": False,
@@ -67,6 +69,10 @@
     "cpu_offloading_weights": False,
     "cross_entropy_fusion_impl": "native",
     "cross_entropy_loss_fusion": True,
+    "csa_compress_ratios": None,
+    "csa_compress_rotary_base": 40000.0,
+    "csa_dense_mode": False,
+    "csa_window_size": 128,
     "cuda_graph_impl": "none",
     "cuda_graph_retain_backward_graph": False,
     "cuda_graph_scope": [],
@@ -75,6 +81,7 @@
     "deallocate_pipeline_outputs": True,
     "defer_embedding_wgrad_compute": False,
     "delay_wgrad_compute": False,
+    "dense_grouped_gemm": False,
     "overlap_dispatch_backward_with_experts_wgrad": False,
     "deterministic_mode": False,
     "disable_bf16_reduced_precision_matmul": False,
@@ -89,6 +96,7 @@
     "embedding_init_method_std": 0.014,
     "enable_autocast": False,
     "enable_cuda_graph": False,
+    "enable_hyper_connections": False,
     "ep_overlap_early_attn_memory_release": False,
     "experimental_attention_variant": None,
     "expert_model_parallel_size": 4,
@@ -127,6 +135,7 @@
     "hidden_dropout": 0.0,
     "hidden_size": 2688,
     "hierarchical_context_parallel_sizes": None,
+    "high_priority_a2a_comm_stream": False,
     "inference_fuse_tp_communication": False,
     "inference_rng_tracker": False,
     "inference_sampling_seed": 42,
@@ -145,12 +154,16 @@
     "linear_num_value_heads": 32,
     "linear_value_head_dim": 128,
     "log_max_attention_logit": False,
+    "log_moe_overload_factor": False,
     "mamba_head_dim": 64,
     "mamba_num_groups": 8,
     "mamba_num_heads": 64,
     "mamba_state_dim": 128,
     "masked_softmax_fusion": True,
     "memory_efficient_layer_norm": False,
+    "mhc_init_gating_factor": 0.01,
+    "mhc_recompute_layer_num": None,
+    "mhc_sinkhorn_iterations": 20,
     "microbatch_group_size_per_vp_stage": 1,
     "mlp_chunks_for_prefill": 1,
     "moe_apply_probs_on_input": False,
@@ -158,18 +171,28 @@
     "moe_deepep_num_sms": 20,
     "moe_enable_deepep": False,
     "moe_expert_capacity_factor": None,
+    "moe_expert_rank_capacity_factor": None,
     "moe_ffn_hidden_size": 1856,
     "moe_flex_dispatcher_backend": "deepep",
     "moe_grouped_gemm": True,
-    "moe_hybridep_num_sms": 16,
+    "moe_hybridep_num_sms": None,
+    "moe_hybridep_num_sms_preprocessing": 108,
+    "moe_hybridep_num_blocks_permute": None,
+    "moe_hybridep_num_blocks_unpermute": None,
     "moe_input_jitter_eps": None,
     "moe_latent_size": None,
     "moe_layer_freq": 1,
     "moe_layer_recompute": False,
+    "moe_n_hash_layers": 0,
     "moe_pad_expert_input_to_capacity": False,
     "moe_pad_experts_for_cuda_graph_inference": False,
+    "moe_paged_stash": False,
+    "moe_paged_stash_buffer_size_factor_cpu": 0.0,
+    "moe_paged_stash_buffer_size_factor_cuda": 1.1,
+    "moe_paged_stash_page_size": 64,
     "moe_per_layer_logging": False,
     "moe_permute_fusion": False,
+    "moe_permute_fusion_into_hybridep": False,
     "moe_router_bias_update_rate": 0.001,
     "moe_router_dtype": "fp64",
     "moe_router_enable_expert_bias": True,
@@ -219,6 +242,7 @@
     "num_microbatches_with_partial_activation_checkpoints": None,
     "num_moe_experts": 128,
     "num_query_groups": 2,
+    "num_residual_streams": 4,
     "output_layer_init_method": {},
     "overlap_moe_expert_parallel_comm": False,
     "overlap_p2p_comm": False,
@@ -241,6 +265,7 @@
     "recompute_method": None,
     "recompute_modules": ["core_attn"],
     "recompute_num_layers": None,
+    "rotary_base_per_layer": None,
     "rotary_interleaved": False,
     "sequence_parallel": True,
     "softmax_scale": None,
@@ -265,6 +290,7 @@
     "tp_only_amax_red": False,
     "transformer_impl": "transformer_engine",
     "use_cpu_initialization": None,
+    "use_fused_mhc": False,
     "use_fused_weighted_squared_relu": False,
     "use_inference_optimized_layers": False,
     "use_kitchen": False,
@@ -282,12 +308,24 @@
     "fine_grained_activation_offloading": False,
     "min_offloaded_tensor_size": 1024 * 1024,
     "offload_modules": [],
+    "delay_offload_until_cuda_graph": False,
+    "delta_offload_bytes_across_pp_ranks": 0,
+    "activation_offload_fraction": 1.0,
+    "dynamic_context_parallel": False,
+    "min_dynamic_context_parallel_size": 1,
     "hybrid_context_parallel": False,
     "max_seqlen_per_dp_cp_rank": None,
+    "fallback_to_eager_attn": False,
     "inference_disable_triton_nvls_kernels": False,
-    "moe_router_force_biased": None,
     "inference_grouped_gemm_backend": "auto",
     "inference_moe_disable_fused_quant_kernels": False,
+    "linear_attention_type": None,
+    "moe_mlp_glu_interleave_size": None,
+    "moe_router_force_biased": None,
+    "sequence_packing_scheduler": None,
+    "use_transformer_engine_op_fuser": False,
+    "moe_single_grouped_weight": False,
+    "moe_single_grouped_bias": False,
 }
 # Fields to ignore entirely (ephemeral, environment-specific, very large).
 SKIP_FIELDS = set()
@@ -484,6 +522,7 @@ def create_test_args(self):
     def setup_method(self, method):
 
         os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+        destroy_moe_metrics_tracker()
         args = self.create_test_args()
         set_args(args)
 
diff --git a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py
index 2171e8db810..8b492dc23fe 100644
--- a/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py
+++ b/tests/unit_tests/pipeline_parallel/test_fine_grained_activation_offloading.py
@@ -23,6 +23,7 @@
 EPSILON = 0.30
 EPSILON_A2A = 0.30
 DELTA = 20  # MiB
+CUDA_GRAPH_DELTA = 200  # MiB — Considering some CG overhead
 
 
 def _reset_cuda_memory() -> None:
@@ -45,11 +46,17 @@ def _build_gpt_model(
     offload_modules: Optional[List[str]],
     min_offloaded_tensor_size: int,
     is_mla: bool,
+    enable_hyper_connections: bool = False,
+    num_residual_streams: int = 4,
+    mhc_recompute_layer_num: Optional[int] = None,
 ) -> GPTModel:
     """Build a GPTModel that uses TE-based transformer layer spec."""
     model_parallel_cuda_manual_seed(seed)
     torch.manual_seed(seed)
     ConfigClass = MLATransformerConfig if is_mla else TransformerConfig
+    recompute_modules = ["layernorm", "moe_act"] if num_experts is not None else ["layernorm"]
+    if enable_hyper_connections and mhc_recompute_layer_num is not None:
+        recompute_modules.append("mhc")
     transformer_config = ConfigClass(
         num_layers=num_layers,
         hidden_size=hidden_size,
@@ -57,8 +64,9 @@ def _build_gpt_model(
         use_cpu_initialization=True,
         attention_backend=AttnBackend.unfused,
         bf16=True,
+        params_dtype=torch.bfloat16,
         # Recompute
-        recompute_modules=["layernorm", "moe_act"] if num_experts is not None else ["layernorm"],
+        recompute_modules=recompute_modules,
         recompute_granularity="selective",
         # MoE
         num_moe_experts=num_experts,
@@ -67,6 +75,10 @@ def _build_gpt_model(
         fine_grained_activation_offloading=fine_grained_activation_offloading,
         offload_modules=offload_modules,
         min_offloaded_tensor_size=min_offloaded_tensor_size,
+        # Hyper Connection settings
+        enable_hyper_connections=enable_hyper_connections,
+        num_residual_streams=num_residual_streams,
+        mhc_recompute_layer_num=mhc_recompute_layer_num,
     )
     gpt_model = GPTModel(
         config=transformer_config,
@@ -74,6 +86,7 @@ def _build_gpt_model(
             num_experts=num_experts,
             moe_grouped_gemm=num_experts is not None,
             multi_latent_attention=is_mla,
+            enable_hyper_connection=enable_hyper_connections,
         ),
         vocab_size=vocab_size,
         max_sequence_length=seq_length,
@@ -319,7 +332,6 @@ def test_gpt_fine_grained_activation_offloading_correctness_and_memory(
         ("alltoall", True, ["mlp_norm"]),
         ("alltoall", False, ["expert_fc1"]),
         ("alltoall", False, ["moe_act"]),
-        ("alltoall", False, ["mlp_norm", "expert_fc1", "moe_act"]),
         (
             "alltoall",
             True,
@@ -418,6 +430,7 @@ def _build_overlap_moe_gpt(
             recompute_modules=["layernorm", "moe_act"],
             recompute_granularity="selective",
             bf16=True,
+            params_dtype=torch.bfloat16,
             # MoE + EP overlap
             num_moe_experts=num_experts,
             moe_grouped_gemm=True,
@@ -569,3 +582,690 @@ def _run_schedule_1f1b_two_microbatches(
                     )
     finally:
         Utils.destroy_model_parallel()
+
+
+# =============================================================================
+# CUDA Graph + Fine-grained Activation Offloading Tests
+# =============================================================================
+
+
+def _build_gpt_model_with_cuda_graph(
+    *,
+    seed: int,
+    num_layers: int,
+    hidden_size: int,
+    num_attention_heads: int,
+    vocab_size: int,
+    seq_length: int,
+    num_experts: Optional[int],
+    fine_grained_activation_offloading: bool,
+    offload_modules: Optional[List[str]],
+    min_offloaded_tensor_size: int,
+    is_mla: bool,
+    cuda_graph_impl: str,
+    cuda_graph_scope: Optional[List[str]],
+    cuda_graph_warmup_steps: int,
+    delay_offload_until_cuda_graph: bool = False,
+    activation_offload_fraction: float = 1.0,
+    enable_hyper_connections: bool = False,
+    num_residual_streams: int = 4,
+) -> GPTModel:
+    """Build a GPTModel with CUDA Graph support and fine-grained activation offloading."""
+    model_parallel_cuda_manual_seed(seed)
+    torch.manual_seed(seed)
+    ConfigClass = MLATransformerConfig if is_mla else TransformerConfig
+    recompute_modules = ["layernorm", "moe_act"] if num_experts is not None else ["layernorm"]
+    transformer_config = ConfigClass(
+        num_layers=num_layers,
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        use_cpu_initialization=True,
+        attention_backend=AttnBackend.unfused,
+        bf16=True,
+        params_dtype=torch.bfloat16,
+        # Recompute (note: "mhc" recompute is incompatible with offloading)
+        recompute_modules=recompute_modules,
+        recompute_granularity="selective",
+        # MoE
+        num_moe_experts=num_experts,
+        moe_grouped_gemm=(num_experts is not None),
+        # Fine-grained activation offloading
+        fine_grained_activation_offloading=fine_grained_activation_offloading,
+        offload_modules=offload_modules,
+        min_offloaded_tensor_size=min_offloaded_tensor_size,
+        delay_offload_until_cuda_graph=delay_offload_until_cuda_graph,
+        activation_offload_fraction=activation_offload_fraction,
+        # CUDA Graph settings
+        cuda_graph_impl=cuda_graph_impl,
+        cuda_graph_scope=cuda_graph_scope,
+        cuda_graph_warmup_steps=cuda_graph_warmup_steps,
+        use_te_rng_tracker=True,
+        # Hyper Connection settings
+        enable_hyper_connections=enable_hyper_connections,
+        num_residual_streams=num_residual_streams,
+    )
+    gpt_model = GPTModel(
+        config=transformer_config,
+        transformer_layer_spec=get_gpt_layer_with_transformer_engine_spec(
+            num_experts=num_experts,
+            moe_grouped_gemm=num_experts is not None,
+            multi_latent_attention=is_mla,
+            enable_hyper_connection=enable_hyper_connections,
+        ),
+        vocab_size=vocab_size,
+        max_sequence_length=seq_length,
+    ).bfloat16()
+    return gpt_model
+
+
+def _run_iters_with_cuda_graph(
+    model: GPTModel,
+    *,
+    input_ids: torch.Tensor,
+    position_ids: torch.Tensor,
+    attention_mask: torch.Tensor,
+    num_warmup_iters: int,
+    num_measure_iters: int,
+    enable_offload_reset: bool,
+) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], int]:
+    """
+    Run multiple forward+backward iterations with CUDA graph capture.
+
+    Returns:
+      - logits from last iteration (CPU float32)
+      - selected grads from last iteration (CPU float32)
+      - peak_memory_allocated (bytes) during measurement iterations
+    """
+    from megatron.core.num_microbatches_calculator import (
+        destroy_num_microbatches_calculator,
+        init_num_microbatches_calculator,
+    )
+    from megatron.core.transformer.cuda_graphs import TECudaGraphHelper
+
+    # Switch to a non-default stream so AccumulateGrad nodes (created during warmup backward
+    # passes) are associated with this stream rather than the default stream.  If they are on
+    # the default stream, CUDA graph capture will fail with cudaErrorStreamCaptureImplicit.
+    te_side_stream = torch.cuda.Stream()
+    te_side_stream.wait_stream(torch.cuda.current_stream())
+    torch.cuda.set_stream(te_side_stream)
+
+    micro_batch_size = input_ids.shape[0]
+    init_num_microbatches_calculator(
+        rank=0,
+        rampup_batch_size=None,
+        global_batch_size=micro_batch_size,
+        micro_batch_size=micro_batch_size,
+        data_parallel_size=1,
+        decrease_batch_size_if_needed=False,
+    )
+
+    if enable_offload_reset:
+        off_interface.reset()
+
+    # Warmup iterations (before CUDA graph capture)
+    for _ in range(num_warmup_iters):
+        if enable_offload_reset:
+            off_interface.reset()
+        logits = model(
+            input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask
+        )
+        loss = logits.float().sum()
+        loss.backward()
+        # Zero grads for next iteration
+        for p in model.parameters():
+            if p.grad is not None:
+                p.grad.zero_()
+
+    # Trigger post-warmup offload decisions
+    if enable_offload_reset:
+        off_interface.reset()
+
+    # TECudaGraphHelper expects model chunks to have zero_grad_buffer() (from DDP wrappers).
+    # For a plain GPTModel, this is a no-op (same as the DataParallelBase base implementation).
+    if not hasattr(model, 'zero_grad_buffer'):
+        model.zero_grad_buffer = lambda: None
+
+    # Create CUDA graphs after warmup
+    cuda_graph_helper = TECudaGraphHelper(
+        model=[model],
+        config=model.config,
+        seq_length=input_ids.shape[1],
+        micro_batch_size=input_ids.shape[0],
+        optimizers=[],
+    )
+    cuda_graph_helper.create_cudagraphs()
+
+    # Measurement iterations (with CUDA graph replay)
+    torch.cuda.reset_peak_memory_stats()
+    for i in range(num_measure_iters):
+        if enable_offload_reset:
+            off_interface.reset()
+        logits = model(
+            input_ids=input_ids, position_ids=position_ids, attention_mask=attention_mask
+        )
+        loss = logits.float().sum()
+        loss.backward()
+        if i < num_measure_iters - 1:
+            for p in model.parameters():
+                if p.grad is not None:
+                    p.grad.zero_()
+
+    torch.cuda.synchronize()
+    peak_bytes = int(torch.cuda.max_memory_allocated())
+
+    # Capture grads from last iteration
+    grads: Dict[str, torch.Tensor] = {}
+    for name, p in model.named_parameters():
+        grads[name] = p.grad.detach().float().cpu() if p.grad is not None else None
+
+    # Cleanup CUDA graphs
+    if cuda_graph_helper.graphs_created():
+        cuda_graph_helper.delete_cuda_graphs()
+
+    destroy_num_microbatches_calculator()
+
+    return logits.detach().float().cpu(), grads, peak_bytes
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for offloading tests.")
+@pytest.mark.skipif(
+    not is_te_min_version("2.14.0"), reason="CUDA Graph with TE RNG tracker requires TE >= 2.13.0"
+)
+@pytest.mark.flaky_in_dev
+@pytest.mark.parametrize(
+    "is_mla, offload_modules, cuda_graph_scope, activation_offload_fraction, delay_offload",
+    [
+        # MoE model with attention CUDA graph + attn offloading
+        (False, ["core_attn", "attn_proj"], ["attn", "moe_router"], 1.0, True),
+        (False, ["expert_fc1", "moe_act"], ["attn", "moe_router", "moe_preprocess"], 1.0, True),
+        (False, ["core_attn", "attn_proj", "expert_fc1"], ["attn", "moe_router"], 1.0, True),
+        (
+            False,
+            ["core_attn", "attn_proj", "expert_fc1", "moe_act"],
+            ["attn", "moe_router"],
+            1.0,
+            True,
+        ),
+        (
+            False,
+            ["core_attn", "expert_fc1", "moe_act"],
+            ["attn", "moe_router", "moe_preprocess"],
+            1.0,
+            True,
+        ),
+        (
+            True,
+            ["core_attn", "attn_proj", "expert_fc1", "moe_act"],
+            ["attn", "moe_router", "moe_preprocess"],
+            1.0,
+            True,
+        ),
+        # Test activation_offload_fraction parameter
+        (False, ["core_attn", "attn_proj", "expert_fc1"], ["attn", "moe_router"], 0.0, True),
+        (False, ["core_attn", "attn_proj", "expert_fc1"], ["attn", "moe_router"], 0.5, True),
+        # Test delay_offload_until_cuda_graph parameter
+        (False, ["core_attn", "attn_proj", "expert_fc1"], ["attn", "moe_router"], 1.0, False),
+    ],
+)
+def test_fine_grained_activation_offloading_with_cuda_graph(
+    is_mla: bool,
+    offload_modules: List[str],
+    cuda_graph_scope: List[str],
+    activation_offload_fraction: float,
+    delay_offload: bool,
+):
+    """
+    Test fine-grained activation offloading combined with CUDA graph capture.
+
+    Verifies:
+    - Forward output correctness with CUDA graph + offloading
+    - Backward gradient correctness
+    - Memory savings from offloading are preserved with CUDA graphs
+    - Different activation_offload_fraction values work correctly
+    - Both delay_offload_until_cuda_graph=True/False produce correct results
+    """
+    from megatron.core.tensor_parallel.random import initialize_rng_tracker
+
+    os.environ.pop("NVTE_FUSED_ATTN", None)
+    os.environ.pop("NVTE_FLASH_ATTN", None)
+    os.environ.pop("NVTE_UNFUSED_ATTN", None)
+
+    initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True)
+    Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+
+    seed = 123
+    num_experts = 4  # Always MoE model
+    num_layers = 4  # Smaller for faster test with CUDA graphs
+    hidden_size = 1024
+    num_attention_heads = 8
+    vocab_size = 512
+    seq_length = 512
+    micro_batch_size = 2
+    device = torch.device("cuda")
+    cuda_graph_warmup_steps = 3
+
+    input_ids, position_ids, attention_mask = _make_gpt_inputs(
+        seq_length=seq_length, micro_batch_size=micro_batch_size, device=device
+    )
+
+    off_interface.reset_instance()
+
+    try:
+        # 1) Baseline: CUDA graph enabled, offloading disabled
+        _reset_cuda_memory()
+        base_model = _build_gpt_model_with_cuda_graph(
+            seed=seed,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            vocab_size=vocab_size,
+            seq_length=seq_length,
+            num_experts=num_experts,
+            fine_grained_activation_offloading=False,
+            offload_modules=None,
+            min_offloaded_tensor_size=1024 * 1024,
+            is_mla=is_mla,
+            cuda_graph_impl="transformer_engine",
+            cuda_graph_scope=cuda_graph_scope,
+            cuda_graph_warmup_steps=cuda_graph_warmup_steps,
+        ).cuda()
+        base_model.train()
+
+        base_logits, base_grads, base_peak = _run_iters_with_cuda_graph(
+            base_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            num_warmup_iters=cuda_graph_warmup_steps,
+            num_measure_iters=2,
+            enable_offload_reset=False,
+        )
+        del base_model
+        _reset_cuda_memory()
+
+        # 2) Test: CUDA graph enabled + offloading enabled
+        off_interface.reset_instance()
+
+        off_model = _build_gpt_model_with_cuda_graph(
+            seed=seed,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            vocab_size=vocab_size,
+            seq_length=seq_length,
+            num_experts=num_experts,
+            fine_grained_activation_offloading=True,
+            offload_modules=offload_modules,
+            min_offloaded_tensor_size=1024,  # Force offloading for determinism
+            is_mla=is_mla,
+            cuda_graph_impl="transformer_engine",
+            cuda_graph_scope=cuda_graph_scope,
+            cuda_graph_warmup_steps=cuda_graph_warmup_steps,
+            delay_offload_until_cuda_graph=delay_offload,
+            activation_offload_fraction=activation_offload_fraction,
+        ).cuda()
+        off_model.train()
+
+        off_logits, off_grads, off_peak = _run_iters_with_cuda_graph(
+            off_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            num_warmup_iters=cuda_graph_warmup_steps,
+            num_measure_iters=2,
+            enable_offload_reset=True,
+        )
+        del off_model
+        _reset_cuda_memory()
+
+        # 3) Correctness checks
+        assert torch.allclose(
+            off_logits, base_logits, rtol=1e-2, atol=1e-2
+        ), f"Logits mismatch: max_diff={torch.max(torch.abs(off_logits - base_logits))}"
+        assert set(off_grads.keys()) == set(base_grads.keys())
+        for name, gb in base_grads.items():
+            go = off_grads[name]
+            if gb is None or go is None:
+                assert gb is None and go is None, f"Grad None mismatch for {name}"
+                continue
+            assert torch.allclose(
+                go, gb, rtol=1e-2, atol=1e-2
+            ), f"Grad mismatch for {name}: max_diff={torch.max(torch.abs(go - gb))}"
+
+        # 4) Memory checks - offloading should still reduce memory with CUDA graphs
+        saved_mib = (base_peak - off_peak) / (1024**2)
+        print(
+            f"CUDA Graph + Offload test (fraction={activation_offload_fraction}, delay={delay_offload}): "
+            f"base_peak={base_peak/(1024**2):.2f}MiB, "
+            f"off_peak={off_peak/(1024**2):.2f}MiB, "
+            f"saved={saved_mib:.2f}MiB"
+        )
+
+        # Basic sanity checks
+        assert not torch.isnan(off_logits).any(), "NaN detected in logits"
+        assert not torch.isinf(off_logits).any(), "Inf detected in logits"
+
+        # Check gradients are valid
+        for name, g in off_grads.items():
+            if g is not None:
+                assert not torch.isnan(g).any(), f"NaN detected in grad for {name}"
+                assert not torch.isinf(g).any(), f"Inf detected in grad for {name}"
+
+        # Note: With CUDA graphs, memory behavior may differ from eager mode.
+        # We check that offloading doesn't significantly increase memory.
+        # In some cases, graph capture overhead may offset offload savings.
+        assert saved_mib >= -CUDA_GRAPH_DELTA, (
+            f"Offloading with CUDA graph significantly increased memory: "
+            f"saved={saved_mib:.2f}MiB (negative means increase)"
+        )
+
+    finally:
+        Utils.destroy_model_parallel()
+
+
+@pytest.mark.flaky_in_dev
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for offloading tests.")
+@pytest.mark.skipif(
+    not is_te_min_version("2.14.0"), reason="CUDA Graph with TE RNG tracker requires TE >= 2.13.0"
+)
+@pytest.mark.parametrize(
+    "offload_modules, cuda_graph_scope, delay_offload",
+    [
+        # mHC + MoE with attention CUDA graph + attn offloading
+        (["core_attn", "attn_proj"], ["attn", "moe_router"], True),
+        # mHC + MoE with expert offloading
+        (["expert_fc1", "moe_act"], ["attn", "moe_router", "moe_preprocess"], True),
+        # mHC + MoE with combined offloading
+        (["core_attn", "attn_proj", "expert_fc1", "moe_act"], ["attn", "moe_router"], True),
+        # mHC + delay_offload_until_cuda_graph=False
+        (["core_attn", "attn_proj", "expert_fc1"], ["attn", "moe_router"], False),
+    ],
+)
+def test_mhc_fine_grained_activation_offloading_with_cuda_graph(
+    offload_modules: List[str], cuda_graph_scope: List[str], delay_offload: bool
+):
+    """
+    Test mHC (Hyper Connection) + fine-grained activation offloading + CUDA graph.
+
+    This validates that the fix to HyperConnectionTransformerLayer._te_cuda_graph_replay_impl
+    correctly preserves the delay_offload_until_cuda_graph lifecycle (enter_replay /
+    flush_delayed_groups / exit_replay) from the parent class.
+
+    Note: "mhc" recompute is incompatible with fine_grained_activation_offloading,
+    so mhc recompute is NOT enabled here. HC still functions, just without the mHC-specific
+    selective recompute optimization.
+    """
+    from megatron.core.tensor_parallel.random import initialize_rng_tracker
+
+    os.environ.pop("NVTE_FUSED_ATTN", None)
+    os.environ.pop("NVTE_FLASH_ATTN", None)
+    os.environ.pop("NVTE_UNFUSED_ATTN", None)
+
+    initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True)
+    Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+
+    seed = 123
+    num_experts = 4
+    num_layers = 4
+    hidden_size = 1024
+    num_attention_heads = 8
+    vocab_size = 512
+    seq_length = 512
+    micro_batch_size = 2
+    device = torch.device("cuda")
+    cuda_graph_warmup_steps = 3
+
+    input_ids, position_ids, attention_mask = _make_gpt_inputs(
+        seq_length=seq_length, micro_batch_size=micro_batch_size, device=device
+    )
+
+    off_interface.reset_instance()
+
+    try:
+        # 1) Baseline: mHC + CUDA graph enabled, offloading disabled
+        _reset_cuda_memory()
+        base_model = _build_gpt_model_with_cuda_graph(
+            seed=seed,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            vocab_size=vocab_size,
+            seq_length=seq_length,
+            num_experts=num_experts,
+            fine_grained_activation_offloading=False,
+            offload_modules=None,
+            min_offloaded_tensor_size=1024 * 1024,
+            is_mla=False,
+            cuda_graph_impl="transformer_engine",
+            cuda_graph_scope=cuda_graph_scope,
+            cuda_graph_warmup_steps=cuda_graph_warmup_steps,
+            enable_hyper_connections=True,
+            num_residual_streams=4,
+        ).cuda()
+        base_model.train()
+
+        base_logits, base_grads, base_peak = _run_iters_with_cuda_graph(
+            base_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            num_warmup_iters=cuda_graph_warmup_steps,
+            num_measure_iters=2,
+            enable_offload_reset=False,
+        )
+        del base_model
+        _reset_cuda_memory()
+
+        # 2) Test: mHC + CUDA graph enabled + offloading enabled
+        off_interface.reset_instance()
+
+        off_model = _build_gpt_model_with_cuda_graph(
+            seed=seed,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            vocab_size=vocab_size,
+            seq_length=seq_length,
+            num_experts=num_experts,
+            fine_grained_activation_offloading=True,
+            offload_modules=offload_modules,
+            min_offloaded_tensor_size=1024,
+            is_mla=False,
+            cuda_graph_impl="transformer_engine",
+            cuda_graph_scope=cuda_graph_scope,
+            cuda_graph_warmup_steps=cuda_graph_warmup_steps,
+            delay_offload_until_cuda_graph=delay_offload,
+            enable_hyper_connections=True,
+            num_residual_streams=4,
+        ).cuda()
+        off_model.train()
+
+        off_logits, off_grads, off_peak = _run_iters_with_cuda_graph(
+            off_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            num_warmup_iters=cuda_graph_warmup_steps,
+            num_measure_iters=2,
+            enable_offload_reset=True,
+        )
+        del off_model
+        _reset_cuda_memory()
+
+        # 3) Correctness checks
+        assert torch.allclose(
+            off_logits, base_logits, rtol=1e-2, atol=1e-2
+        ), f"mHC logits mismatch: max_diff={torch.max(torch.abs(off_logits - base_logits))}"
+        assert set(off_grads.keys()) == set(base_grads.keys())
+        for name, gb in base_grads.items():
+            go = off_grads[name]
+            if gb is None or go is None:
+                assert gb is None and go is None, f"Grad None mismatch for {name}"
+                continue
+            assert torch.allclose(
+                go, gb, rtol=1e-2, atol=1e-2
+            ), f"mHC grad mismatch for {name}: max_diff={torch.max(torch.abs(go - gb))}"
+
+        # 4) Memory and sanity checks
+        saved_mib = (base_peak - off_peak) / (1024**2)
+        print(
+            f"mHC + CUDA Graph + Offload test (delay={delay_offload}): "
+            f"base_peak={base_peak/(1024**2):.2f}MiB, "
+            f"off_peak={off_peak/(1024**2):.2f}MiB, "
+            f"saved={saved_mib:.2f}MiB"
+        )
+
+        assert not torch.isnan(off_logits).any(), "NaN detected in mHC logits"
+        assert not torch.isinf(off_logits).any(), "Inf detected in mHC logits"
+
+        for name, g in off_grads.items():
+            if g is not None:
+                assert not torch.isnan(g).any(), f"NaN detected in mHC grad for {name}"
+                assert not torch.isinf(g).any(), f"Inf detected in mHC grad for {name}"
+
+        assert saved_mib >= -CUDA_GRAPH_DELTA, (
+            f"mHC offloading with CUDA graph significantly increased memory: "
+            f"saved={saved_mib:.2f}MiB (negative means increase)"
+        )
+
+    finally:
+        Utils.destroy_model_parallel()
+
+
+@pytest.mark.flaky_in_dev
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is required for offloading tests.")
+def test_mhc_recompute_with_non_conflicting_offload_modules():
+    """
+    Test that mHC recompute ('mhc' in recompute_modules) works correctly with
+    offload modules that do NOT overlap with mHC checkpoints.
+
+    mHC checkpoints wrap input_layernorm (inside attn_norm) and pre_mlp_layernorm
+    (inside mlp_norm). Other offload modules (qkv_linear, core_attn, attn_proj,
+    expert_fc1, moe_act) live inside self_attention/MLP which are not wrapped by
+    mHC checkpoints, so there is no backward hook ordering conflict.
+    """
+    offload_modules = ["qkv_linear", "core_attn", "attn_proj", "expert_fc1", "moe_act"]
+
+    os.environ.pop("NVTE_FUSED_ATTN", None)
+    os.environ.pop("NVTE_FLASH_ATTN", None)
+    os.environ.pop("NVTE_UNFUSED_ATTN", None)
+    Utils.initialize_model_parallel(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
+
+    seed = 123
+    num_experts = 4
+    num_layers = 8
+    hidden_size = 1024
+    num_attention_heads = 8
+    vocab_size = 1024
+    seq_length = 1024
+    micro_batch_size = 2
+    device = torch.device("cuda")
+
+    input_ids, position_ids, attention_mask = _make_gpt_inputs(
+        seq_length=seq_length, micro_batch_size=micro_batch_size, device=device
+    )
+
+    off_interface.reset_instance()
+
+    try:
+        # 1) Baseline: mHC + mhc recompute, no offloading
+        _reset_cuda_memory()
+        base_model = _build_gpt_model(
+            seed=seed,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            vocab_size=vocab_size,
+            seq_length=seq_length,
+            num_experts=num_experts,
+            fine_grained_activation_offloading=False,
+            offload_modules=None,
+            min_offloaded_tensor_size=1024 * 1024,
+            is_mla=False,
+            enable_hyper_connections=True,
+            mhc_recompute_layer_num=2,
+        ).cuda()
+        base_model.train()
+
+        _run_one_iter_and_capture(
+            base_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            enable_offload_reset=False,
+        )
+        _reset_cuda_memory()
+        base_logits, base_grads, base_peak = _run_one_iter_and_capture(
+            base_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            enable_offload_reset=False,
+        )
+        del base_model
+        _reset_cuda_memory()
+
+        # 2) mHC + mhc recompute + offloading (non-conflicting modules only)
+        off_model = _build_gpt_model(
+            seed=seed,
+            num_layers=num_layers,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            vocab_size=vocab_size,
+            seq_length=seq_length,
+            num_experts=num_experts,
+            fine_grained_activation_offloading=True,
+            offload_modules=offload_modules,
+            min_offloaded_tensor_size=1024,
+            is_mla=False,
+            enable_hyper_connections=True,
+            mhc_recompute_layer_num=2,
+        ).cuda()
+        off_model.train()
+
+        _run_one_iter_and_capture(
+            off_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            enable_offload_reset=True,
+        )
+        off_interface.reset()
+
+        _reset_cuda_memory()
+        off_logits, off_grads, off_peak = _run_one_iter_and_capture(
+            off_model,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            enable_offload_reset=True,
+        )
+        del off_model
+        _reset_cuda_memory()
+
+        # 3) Correctness checks
+        assert torch.allclose(off_logits, base_logits, rtol=1e-3, atol=1e-3), (
+            f"mHC recompute + offload logits mismatch: "
+            f"max_diff={torch.max(torch.abs(off_logits - base_logits))}"
+        )
+        assert set(off_grads.keys()) == set(base_grads.keys())
+        for name, gb in base_grads.items():
+            go = off_grads[name]
+            if gb is None or go is None:
+                assert gb is None and go is None, f"Grad None mismatch for {name}"
+                continue
+            assert torch.allclose(go, gb, rtol=1e-3, atol=1e-3), (
+                f"mHC recompute + offload grad mismatch for {name}: "
+                f"max_diff={torch.max(torch.abs(go - gb))}"
+            )
+
+        # 4) Memory checks
+        saved_mib = (base_peak - off_peak) / (1024**2)
+        assert saved_mib > 0.0, (
+            f"Expected GPU peak memory reduction for offload_modules={offload_modules}, "
+            f"but got saved={saved_mib:.2f}MiB"
+        )
+        print(f"mHC recompute + offload ({offload_modules}): " f"saved={saved_mib:.2f}MiB")
+
+    finally:
+        Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/pipeline_parallel/test_pp_mhc_compatibility.py b/tests/unit_tests/pipeline_parallel/test_pp_mhc_compatibility.py
new file mode 100644
index 00000000000..eda8ffe7df4
--- /dev/null
+++ b/tests/unit_tests/pipeline_parallel/test_pp_mhc_compatibility.py
@@ -0,0 +1,1123 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""
+Unit tests for PP / VPP + mHC (Hyper Connections) compatibility.
+
+Tests cover:
+1. get_tensor_shapes: shape correctness with mHC for all PP stages
+2. get_num_layers_to_build: layer counts with standalone embedding/loss + mHC
+3. TransformerBlock expand/contract: correct placement at PP boundaries
+4. VPP tensor_shape: single shape used across all chunks with mHC
+5. E2E forward pass: PP + mHC + standalone embedding/loss (multi-GPU)
+6. Flexible VPP layout (pipeline_model_parallel_layout) + mHC compatibility
+
+Run with:
+    uv run --no-sync pytest tests/unit_tests/pipeline_parallel/test_pp_mhc_compatibility.py -s -x
+    # Multi-GPU tests (world_size >= 2):
+    torchrun --nproc-per-node=2 -m pytest tests/unit_tests/pipeline_parallel/test_pp_mhc_compatibility.py -s -x
+"""
+
+from types import SimpleNamespace
+from unittest.mock import MagicMock
+
+import pytest
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.pipeline_parallel.schedules import get_tensor_shapes
+from megatron.core.transformer.hyper_connection import HyperConnectionModule
+from megatron.core.transformer.transformer_block import get_num_layers_to_build
+from megatron.core.transformer.transformer_config import TransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_pp_group(rank: int, size: int):
+    """Create a mock PP process group with given rank and size."""
+    pg = MagicMock()
+    pg.rank.return_value = rank
+    pg.size.return_value = size
+    return pg
+
+
+def _make_tp_cp_groups(tp_size: int = 1, cp_size: int = 1):
+    tp = MagicMock()
+    tp.size.return_value = tp_size
+    cp = MagicMock()
+    cp.size.return_value = cp_size
+    return tp, cp
+
+
+def _get_send_recv_shapes(config, pp_size, seq=32, mbs=2):
+    """Get (send_shape, recv_shape) for each PP rank."""
+    tp, cp = _make_tp_cp_groups()
+    results = []
+    for rank in range(pp_size):
+        send = get_tensor_shapes(
+            seq_length=seq,
+            micro_batch_size=mbs,
+            decoder_seq_length=None,
+            config=config,
+            tp_group=tp,
+            cp_group=cp,
+            pp_group=_make_pp_group(rank, pp_size),
+            is_recv=False,
+        )
+        recv = get_tensor_shapes(
+            seq_length=seq,
+            micro_batch_size=mbs,
+            decoder_seq_length=None,
+            config=config,
+            tp_group=tp,
+            cp_group=cp,
+            pp_group=_make_pp_group(rank, pp_size),
+            is_recv=True,
+        )
+        results.append((send, recv))
+    return results
+
+
+def _make_config(
+    hidden_size=64,
+    num_layers=8,
+    pp_size=2,
+    vp_size=None,
+    enable_hyper_connections=False,
+    num_residual_streams=4,
+    account_for_embedding=False,
+    account_for_loss=False,
+    num_layers_first=None,
+    num_layers_last=None,
+    **extra,
+):
+    """Build a TransformerConfig for testing without initializing parallel state."""
+    kwargs = dict(
+        hidden_size=hidden_size,
+        num_layers=num_layers,
+        num_attention_heads=4,
+        pipeline_model_parallel_size=pp_size,
+        virtual_pipeline_model_parallel_size=vp_size,
+        enable_hyper_connections=enable_hyper_connections,
+        num_residual_streams=num_residual_streams,
+        account_for_embedding_in_pipeline_split=account_for_embedding,
+        account_for_loss_in_pipeline_split=account_for_loss,
+        num_layers_in_first_pipeline_stage=num_layers_first,
+        num_layers_in_last_pipeline_stage=num_layers_last,
+        use_cpu_initialization=True,
+    )
+    if pp_size > 1:
+        kwargs.setdefault('pipeline_dtype', torch.bfloat16)
+    kwargs.update(extra)
+    return TransformerConfig(**kwargs)
+
+
+# ===========================================================================
+# 1. get_tensor_shapes — shape correctness with mHC
+# ===========================================================================
+
+
+class TestGetTensorShapesWithMHC:
+    """Verify get_tensor_shapes returns correct hidden dim for mHC-enabled models."""
+
+    SEQ, MBS, H = 32, 2, 64
+    N_STREAMS = 4
+
+    def _shapes(self, config, pp_rank, pp_size, is_recv):
+        tp, cp = _make_tp_cp_groups()
+        pp = _make_pp_group(pp_rank, pp_size)
+        return get_tensor_shapes(
+            seq_length=self.SEQ,
+            micro_batch_size=self.MBS,
+            decoder_seq_length=None,
+            config=config,
+            tp_group=tp,
+            cp_group=cp,
+            pp_group=pp,
+            is_recv=is_recv,
+        )
+
+    # --- Without mHC (baseline) ---
+
+    def test_no_mhc_pp2_all_stages(self):
+        cfg = _make_config(hidden_size=self.H, pp_size=2, enable_hyper_connections=False)
+        for rank in range(2):
+            for is_recv in (True, False):
+                shapes = self._shapes(cfg, rank, 2, is_recv)
+                assert shapes == [(self.SEQ, self.MBS, self.H)]
+
+    # --- With mHC, PP=2 ---
+
+    def test_mhc_pp2_rank0_send_nstream(self):
+        """PP rank 0 sends n*C to rank 1."""
+        cfg = _make_config(
+            hidden_size=self.H,
+            pp_size=2,
+            enable_hyper_connections=True,
+            num_residual_streams=self.N_STREAMS,
+        )
+        shapes = self._shapes(cfg, pp_rank=0, pp_size=2, is_recv=False)
+        assert shapes == [(self.SEQ, self.MBS, self.H * self.N_STREAMS)]
+
+    def test_mhc_pp2_rank0_recv_1stream(self):
+        """PP rank 0 receives nothing from previous (is first stage), so shape = C."""
+        cfg = _make_config(
+            hidden_size=self.H,
+            pp_size=2,
+            enable_hyper_connections=True,
+            num_residual_streams=self.N_STREAMS,
+        )
+        shapes = self._shapes(cfg, pp_rank=0, pp_size=2, is_recv=True)
+        assert shapes == [(self.SEQ, self.MBS, self.H)]
+
+    def test_mhc_pp2_rank1_recv_nstream(self):
+        """PP rank 1 receives n*C from rank 0."""
+        cfg = _make_config(
+            hidden_size=self.H,
+            pp_size=2,
+            enable_hyper_connections=True,
+            num_residual_streams=self.N_STREAMS,
+        )
+        shapes = self._shapes(cfg, pp_rank=1, pp_size=2, is_recv=True)
+        assert shapes == [(self.SEQ, self.MBS, self.H * self.N_STREAMS)]
+
+    def test_mhc_pp2_rank1_send_1stream(self):
+        """PP rank 1 (last stage) sends C (after output_contract)."""
+        cfg = _make_config(
+            hidden_size=self.H,
+            pp_size=2,
+            enable_hyper_connections=True,
+            num_residual_streams=self.N_STREAMS,
+        )
+        shapes = self._shapes(cfg, pp_rank=1, pp_size=2, is_recv=False)
+        assert shapes == [(self.SEQ, self.MBS, self.H)]
+
+    # --- With mHC, PP=4 (intermediate ranks) ---
+
+    def test_mhc_pp4_intermediate_ranks(self):
+        """Intermediate ranks both send and receive n*C."""
+        cfg = _make_config(
+            hidden_size=self.H,
+            pp_size=4,
+            num_layers=8,
+            enable_hyper_connections=True,
+            num_residual_streams=self.N_STREAMS,
+        )
+        for rank in (1, 2):
+            for is_recv in (True, False):
+                shapes = self._shapes(cfg, pp_rank=rank, pp_size=4, is_recv=is_recv)
+                assert shapes == [
+                    (self.SEQ, self.MBS, self.H * self.N_STREAMS)
+                ], f"rank={rank}, is_recv={is_recv}"
+
+    # --- With sequence parallel ---
+
+    def test_mhc_with_sequence_parallel(self):
+        """Sequence parallel divides seq_length by TP size."""
+        cfg = _make_config(
+            hidden_size=self.H,
+            pp_size=2,
+            enable_hyper_connections=True,
+            num_residual_streams=self.N_STREAMS,
+            sequence_parallel=True,
+            tensor_model_parallel_size=2,
+        )
+        tp, cp = _make_tp_cp_groups(tp_size=2)
+        pp = _make_pp_group(0, 2)
+        shapes = get_tensor_shapes(
+            seq_length=self.SEQ,
+            micro_batch_size=self.MBS,
+            decoder_seq_length=None,
+            config=cfg,
+            tp_group=tp,
+            cp_group=cp,
+            pp_group=pp,
+            is_recv=False,
+        )
+        assert shapes == [(self.SEQ // 2, self.MBS, self.H * self.N_STREAMS)]
+
+
+# ===========================================================================
+# 2. get_num_layers_to_build — mHC + standalone embedding/loss
+# ===========================================================================
+
+
+class TestGetNumLayersToBuilWithMHC:
+    """
+    Verify layer counts are correct when mHC is combined with standalone
+    embedding / loss stages (account_for_embedding/loss_in_pipeline_split).
+    mHC itself doesn't change layer counts, but we need to ensure the
+    combination doesn't break.
+    """
+
+    def test_pp2_even_split_mhc(self):
+        cfg = _make_config(num_layers=8, pp_size=2, enable_hyper_connections=True)
+        assert get_num_layers_to_build(cfg, pp_rank=0) == 4
+        assert get_num_layers_to_build(cfg, pp_rank=1) == 4
+
+    def test_pp2_standalone_embedding_mhc(self):
+        """With standalone embedding on PP rank 0, rank 0 builds fewer layers."""
+        cfg = _make_config(
+            num_layers=8,
+            pp_size=2,
+            enable_hyper_connections=True,
+            account_for_embedding=True,
+            account_for_loss=True,
+        )
+        # (8 + 1 + 1) / 2 = 5 per rank
+        # rank 0: 5 - 1 (embedding) = 4 transformer layers
+        # rank 1: 5 - 1 (loss) = 4 transformer layers
+        assert get_num_layers_to_build(cfg, pp_rank=0) == 4
+        assert get_num_layers_to_build(cfg, pp_rank=1) == 4
+
+    def test_pp4_standalone_invalid_division_raises(self):
+        """PP=4, standalone embedding+loss, 12 layers → (12+2)/4=3.5 → raises."""
+        with pytest.raises((ValueError, AssertionError)):
+            _make_config(
+                num_layers=12,
+                pp_size=4,
+                enable_hyper_connections=True,
+                account_for_embedding=True,
+                account_for_loss=True,
+            )
+
+    def test_pp4_standalone_both_mhc_valid(self):
+        """Valid configuration: (14+2)/4 = 4 per rank."""
+        cfg = _make_config(
+            num_layers=14,
+            pp_size=4,
+            enable_hyper_connections=True,
+            account_for_embedding=True,
+            account_for_loss=True,
+        )
+        # rank 0: 4 - 1 (embedding) = 3
+        # rank 1, 2: 4
+        # rank 3: 4 - 1 (loss) = 3
+        assert get_num_layers_to_build(cfg, pp_rank=0) == 3
+        assert get_num_layers_to_build(cfg, pp_rank=1) == 4
+        assert get_num_layers_to_build(cfg, pp_rank=2) == 4
+        assert get_num_layers_to_build(cfg, pp_rank=3) == 3
+
+    def test_uneven_pp_with_mhc(self):
+        """Uneven PP: first stage has 2 layers, last has 2, middle gets 2 each."""
+        cfg = _make_config(
+            num_layers=8,
+            pp_size=4,
+            enable_hyper_connections=True,
+            num_layers_first=2,
+            num_layers_last=2,
+        )
+        assert get_num_layers_to_build(cfg, pp_rank=0) == 2
+        assert get_num_layers_to_build(cfg, pp_rank=1) == 2
+        assert get_num_layers_to_build(cfg, pp_rank=2) == 2
+        assert get_num_layers_to_build(cfg, pp_rank=3) == 2
+
+    def test_vpp_with_mhc(self):
+        """VPP=2 with mHC: each VP stage gets half the layers per rank."""
+        cfg = _make_config(num_layers=8, pp_size=2, vp_size=2, enable_hyper_connections=True)
+        for pp_rank in range(2):
+            for vp_stage in range(2):
+                n = get_num_layers_to_build(cfg, vp_stage=vp_stage, pp_rank=pp_rank)
+                assert n == 2, f"pp_rank={pp_rank}, vp_stage={vp_stage}, got {n}"
+
+    def test_vpp_standalone_embedding_loss_invalid_raises(self):
+        """VPP=2, standalone embedding+loss, pp=2, 8 layers → 10/2=5, 5%2!=0 → raises."""
+        with pytest.raises((ValueError, AssertionError)):
+            _make_config(
+                num_layers=8,
+                pp_size=2,
+                vp_size=2,
+                enable_hyper_connections=True,
+                account_for_embedding=True,
+                account_for_loss=True,
+            )
+
+    def test_vpp_standalone_both_valid_mhc(self):
+        """VPP=2, standalone embed+loss, pp=4, 14 layers → (14+2)/4=4, 4/2=2 per VP."""
+        cfg = _make_config(
+            num_layers=14,
+            pp_size=4,
+            vp_size=2,
+            enable_hyper_connections=True,
+            account_for_embedding=True,
+            account_for_loss=True,
+        )
+        # rank 0, vp 0: first PP + first VP → 2 - 1(embed) = 1
+        assert get_num_layers_to_build(cfg, vp_stage=0, pp_rank=0) == 1
+        # rank 0, vp 1: first PP + second VP → 2
+        assert get_num_layers_to_build(cfg, vp_stage=1, pp_rank=0) == 2
+        # rank 1-2: 2 per VP stage
+        for rank in (1, 2):
+            for vp in (0, 1):
+                assert get_num_layers_to_build(cfg, vp_stage=vp, pp_rank=rank) == 2
+        # rank 3, vp 0: 2
+        assert get_num_layers_to_build(cfg, vp_stage=0, pp_rank=3) == 2
+        # rank 3, vp 1: last PP + last VP → 2 - 1(loss) = 1
+        assert get_num_layers_to_build(cfg, vp_stage=1, pp_rank=3) == 1
+
+
+# ===========================================================================
+# 3. TransformerBlock expand/contract — boundary logic
+# ===========================================================================
+
+
+class TestTransformerBlockMHCBoundaries:
+    """
+    Test that TransformerBlock correctly applies input_expand at pre_process
+    and output_contract at the final layernorm stage.
+    These are pure tensor operation tests — no GPU or parallel state needed.
+    """
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_input_expand(self):
+        n = 4
+        s, b, C = 8, 2, 64
+        x = torch.randn(s, b, C, device='cuda')
+        expanded = HyperConnectionModule.input_expand(x, n)
+        assert expanded.shape == (s, b, n * C)
+        # Each stream should be a copy of input
+        for i in range(n):
+            torch.testing.assert_close(expanded[:, :, i * C : (i + 1) * C], x)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_output_contract(self):
+        n = 4
+        s, b, C = 8, 2, 64
+        x = torch.randn(s, b, n * C, device='cuda')
+        contracted = HyperConnectionModule.output_contract(x, n)
+        assert contracted.shape == (s, b, C)
+        # Should be the mean of all n streams
+        expected = x.view(s, b, n, C).mean(dim=2)
+        torch.testing.assert_close(contracted, expected)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_expand_then_contract_preserves_shape(self):
+        n = 4
+        s, b, C = 8, 2, 64
+        x = torch.randn(s, b, C, device='cuda')
+        expanded = HyperConnectionModule.input_expand(x, n)
+        contracted = HyperConnectionModule.output_contract(expanded, n)
+        assert contracted.shape == x.shape
+        # expand copies all streams → mean of identical streams = original
+        torch.testing.assert_close(contracted, x)
+
+
+# ===========================================================================
+# 3b. Zero-layer VP stage edge cases with mHC
+# ===========================================================================
+
+
+class TestZeroLayerVPStageWithMHC:
+    """
+    When standalone embedding/loss makes a VP stage have very few (1) transformer
+    layers, verify layer counts stay non-negative.
+    """
+
+    def test_vpp_standalone_embed_first_stage_has_1_layer(self):
+        """First VP stage at first PP rank should have exactly 1 layer (2-1=1)."""
+        cfg = _make_config(
+            num_layers=7,
+            pp_size=2,
+            vp_size=2,
+            enable_hyper_connections=True,
+            account_for_embedding=True,
+        )
+        n = get_num_layers_to_build(cfg, vp_stage=0, pp_rank=0)
+        assert n == 1
+        assert n >= 0
+
+    def test_vpp_standalone_loss_last_stage_has_1_layer(self):
+        """Last VP stage at last PP rank should have exactly 1 layer (2-1=1)."""
+        cfg = _make_config(
+            num_layers=7, pp_size=2, vp_size=2, enable_hyper_connections=True, account_for_loss=True
+        )
+        n = get_num_layers_to_build(cfg, vp_stage=1, pp_rank=1)
+        assert n == 1
+        assert n >= 0
+
+    def test_vpp_standalone_both_boundary_layers(self):
+        """Both first and last VP stages lose a layer, but all counts remain >= 0."""
+        cfg = _make_config(
+            num_layers=14,
+            pp_size=4,
+            vp_size=2,
+            enable_hyper_connections=True,
+            account_for_embedding=True,
+            account_for_loss=True,
+        )
+        for pp_rank in range(4):
+            for vp_stage in range(2):
+                n = get_num_layers_to_build(cfg, vp_stage=vp_stage, pp_rank=pp_rank)
+                assert n >= 0, f"pp_rank={pp_rank}, vp_stage={vp_stage} has {n} < 0 layers"
+
+
+# ===========================================================================
+# 4. VPP tensor_shape — single shape for all chunks
+# ===========================================================================
+
+
+class TestVPPTensorShapeWithMHC:
+    """
+    Verify that the interleaved schedule uses n*C for all P2P communication
+    when mHC is enabled with PP > 1.
+    """
+
+    def test_interleaved_tensor_shape_uses_nstream(self):
+        """Reproduce the logic in forward_backward_pipelining_with_interleaving."""
+        hidden_size = 64
+        n_streams = 4
+        pp_size = 2
+
+        config = SimpleNamespace(
+            hidden_size=hidden_size,
+            enable_hyper_connections=True,
+            num_residual_streams=n_streams,
+            sequence_parallel=False,
+        )
+
+        hidden_dim = config.hidden_size
+        if getattr(config, 'enable_hyper_connections', False) and pp_size > 1:
+            hidden_dim = config.hidden_size * getattr(config, 'num_residual_streams', 1)
+
+        assert hidden_dim == hidden_size * n_streams
+
+    def test_interleaved_tensor_shape_no_mhc(self):
+        """Without mHC, hidden_dim = hidden_size."""
+        hidden_size = 64
+        pp_size = 2
+
+        config = SimpleNamespace(
+            hidden_size=hidden_size, enable_hyper_connections=False, sequence_parallel=False
+        )
+
+        hidden_dim = config.hidden_size
+        if getattr(config, 'enable_hyper_connections', False) and pp_size > 1:
+            hidden_dim = config.hidden_size * getattr(config, 'num_residual_streams', 1)
+
+        assert hidden_dim == hidden_size
+
+    def test_interleaved_tensor_shape_pp1_mhc_no_expand(self):
+        """PP=1 with mHC: no P2P communication needed, no shape change."""
+        hidden_size = 64
+        n_streams = 4
+        pp_size = 1
+
+        config = SimpleNamespace(
+            hidden_size=hidden_size,
+            enable_hyper_connections=True,
+            num_residual_streams=n_streams,
+            sequence_parallel=False,
+        )
+
+        hidden_dim = config.hidden_size
+        if getattr(config, 'enable_hyper_connections', False) and pp_size > 1:
+            hidden_dim = config.hidden_size * getattr(config, 'num_residual_streams', 1)
+
+        assert hidden_dim == hidden_size
+
+
+# ===========================================================================
+# 5. Shape consistency across PP stages with VPP + mHC
+# ===========================================================================
+
+
+class TestPPShapeConsistencyWithMHC:
+    """
+    Verify that send shape from one stage matches recv shape of the next stage.
+    This is critical: a mismatch would cause a hang or crash in P2P communication.
+    """
+
+    def test_pp2_mhc_send_recv_match(self):
+        """Rank 0's send shape must match rank 1's recv shape."""
+        cfg = _make_config(hidden_size=64, pp_size=2, enable_hyper_connections=True)
+        shapes = _get_send_recv_shapes(cfg, 2)
+        assert (
+            shapes[0][0] == shapes[1][1]
+        ), f"rank 0 send {shapes[0][0]} != rank 1 recv {shapes[1][1]}"
+
+    def test_pp4_mhc_all_consecutive_match(self):
+        """For all consecutive stages, send[i] == recv[i+1]."""
+        cfg = _make_config(hidden_size=64, num_layers=8, pp_size=4, enable_hyper_connections=True)
+        shapes = _get_send_recv_shapes(cfg, 4)
+        for i in range(3):
+            assert (
+                shapes[i][0] == shapes[i + 1][1]
+            ), f"rank {i} send {shapes[i][0]} != rank {i+1} recv {shapes[i+1][1]}"
+
+    def test_pp4_no_mhc_all_consecutive_match(self):
+        """Baseline: without mHC, all shapes should be plain hidden_size."""
+        cfg = _make_config(hidden_size=64, num_layers=8, pp_size=4)
+        shapes = _get_send_recv_shapes(cfg, 4)
+        for i in range(3):
+            assert shapes[i][0] == shapes[i + 1][1]
+            assert shapes[i][0] == [(32, 2, 64)]
+
+
+# ===========================================================================
+# 6. Standalone embedding / loss — PP boundary + mHC interaction
+# ===========================================================================
+
+
+class TestStandaloneEmbeddingLossWithMHC:
+    """
+    Verify that standalone embedding/loss configurations interact correctly
+    with mHC tensor shapes and layer counting.
+    """
+
+    def test_standalone_embedding_first_stage_has_fewer_layers(self):
+        """With standalone embedding, first PP/VP stage gets 1 fewer layer."""
+        # 7 layers, pp=2, vp=2 → (7+1)/2=4, 4/2=2 per VP stage
+        cfg = _make_config(
+            num_layers=7,
+            pp_size=2,
+            vp_size=2,
+            enable_hyper_connections=True,
+            account_for_embedding=True,
+        )
+        # rank 0, vp 0: first stage → 2 - 1(embed) = 1
+        assert get_num_layers_to_build(cfg, vp_stage=0, pp_rank=0) == 1
+        # rank 0, vp 1: 2
+        assert get_num_layers_to_build(cfg, vp_stage=1, pp_rank=0) == 2
+        # rank 1: 2 each VP
+        assert get_num_layers_to_build(cfg, vp_stage=0, pp_rank=1) == 2
+        assert get_num_layers_to_build(cfg, vp_stage=1, pp_rank=1) == 2
+
+    def test_standalone_loss_last_stage_has_fewer_layers(self):
+        """With standalone loss, last PP/VP stage gets 1 fewer layer."""
+        cfg = _make_config(
+            num_layers=7, pp_size=2, vp_size=2, enable_hyper_connections=True, account_for_loss=True
+        )
+        # (7+1)/2 = 4, 4/2 = 2 per VP
+        # rank 0: 2 each VP
+        assert get_num_layers_to_build(cfg, vp_stage=0, pp_rank=0) == 2
+        assert get_num_layers_to_build(cfg, vp_stage=1, pp_rank=0) == 2
+        # rank 1, vp 0: 2
+        assert get_num_layers_to_build(cfg, vp_stage=0, pp_rank=1) == 2
+        # rank 1, vp 1: last stage → 2 - 1(loss) = 1
+        assert get_num_layers_to_build(cfg, vp_stage=1, pp_rank=1) == 1
+
+    def test_standalone_both_mhc_shapes_still_consistent(self):
+        """With standalone embed+loss, P2P shapes should still match between stages."""
+        cfg = _make_config(
+            hidden_size=64,
+            num_layers=14,
+            pp_size=4,
+            enable_hyper_connections=True,
+            num_residual_streams=4,
+            account_for_embedding=True,
+            account_for_loss=True,
+        )
+        tp, cp = _make_tp_cp_groups()
+        for i in range(3):
+            send = get_tensor_shapes(
+                seq_length=32,
+                micro_batch_size=2,
+                decoder_seq_length=None,
+                config=cfg,
+                tp_group=tp,
+                cp_group=cp,
+                pp_group=_make_pp_group(i, 4),
+                is_recv=False,
+            )
+            recv = get_tensor_shapes(
+                seq_length=32,
+                micro_batch_size=2,
+                decoder_seq_length=None,
+                config=cfg,
+                tp_group=tp,
+                cp_group=cp,
+                pp_group=_make_pp_group(i + 1, 4),
+                is_recv=True,
+            )
+            assert send == recv, f"rank {i}→{i+1}: send={send} recv={recv}"
+
+    def test_mhc_shapes_first_stage_send_vs_second_recv(self):
+        """
+        First stage (pre_process) does input_expand: hidden [s,b,C] → [s,b,n*C].
+        The send shape from rank 0 should be n*C.
+        The recv shape at rank 1 should also be n*C.
+        """
+        H, N = 64, 4
+        cfg = _make_config(
+            hidden_size=H,
+            num_layers=8,
+            pp_size=2,
+            enable_hyper_connections=True,
+            num_residual_streams=N,
+        )
+        tp, cp = _make_tp_cp_groups()
+        send_0 = get_tensor_shapes(
+            seq_length=32,
+            micro_batch_size=2,
+            decoder_seq_length=None,
+            config=cfg,
+            tp_group=tp,
+            cp_group=cp,
+            pp_group=_make_pp_group(0, 2),
+            is_recv=False,
+        )
+        recv_1 = get_tensor_shapes(
+            seq_length=32,
+            micro_batch_size=2,
+            decoder_seq_length=None,
+            config=cfg,
+            tp_group=tp,
+            cp_group=cp,
+            pp_group=_make_pp_group(1, 2),
+            is_recv=True,
+        )
+        assert send_0 == [(32, 2, H * N)]
+        assert recv_1 == [(32, 2, H * N)]
+        assert send_0 == recv_1
+
+    def test_mhc_shapes_last_stage_output_is_1stream(self):
+        """
+        Last stage (post_process) does output_contract: [s,b,n*C] → [s,b,C].
+        The send shape from last rank should be C (but get_tensor_shapes returns C
+        because last rank doesn't send forward).
+        """
+        H, N = 64, 4
+        cfg = _make_config(
+            hidden_size=H,
+            num_layers=8,
+            pp_size=2,
+            enable_hyper_connections=True,
+            num_residual_streams=N,
+        )
+        tp, cp = _make_tp_cp_groups()
+        send_last = get_tensor_shapes(
+            seq_length=32,
+            micro_batch_size=2,
+            decoder_seq_length=None,
+            config=cfg,
+            tp_group=tp,
+            cp_group=cp,
+            pp_group=_make_pp_group(1, 2),
+            is_recv=False,
+        )
+        # Last stage sends C (after contract), not n*C
+        assert send_last == [(32, 2, H)]
+
+
+# ===========================================================================
+# 7. E2E forward pass tests (require multi-GPU)
+# ===========================================================================
+
+
+@pytest.mark.internal
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(
+    int(__import__('os').environ.get('WORLD_SIZE', '1')) < 2, reason="Requires at least 2 GPUs"
+)
+class TestPPForwardWithMHC:
+    """
+    End-to-end forward pass tests with PP + mHC.
+    Requires multi-GPU (torchrun --nproc-per-node=2+).
+    """
+
+    def _run_forward(
+        self, pp_size, vp_size, enable_mhc, account_for_embedding=False, account_for_loss=False
+    ):
+        from megatron.core import mpu
+        from megatron.core.models.gpt.gpt_layer_specs import (
+            get_gpt_layer_with_transformer_engine_spec,
+        )
+        from megatron.core.models.gpt.gpt_model import GPTModel
+        from megatron.core.num_microbatches_calculator import (
+            init_num_microbatches_calculator,
+            unset_num_microbatches_calculator,
+        )
+        from megatron.core.pipeline_parallel import get_forward_backward_func
+        from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+        from megatron.core.transformer.enums import ModelType
+        from megatron.training.global_vars import set_args
+        from tests.unit_tests.test_utilities import Utils
+
+        num_layers = 8
+        hidden_size = 64
+        num_heads = 4
+        seq_length = 16
+        micro_batch_size = 2
+        vocab_size = 128
+
+        Utils.initialize_model_parallel(1, pp_size, vp_size)
+        model_parallel_cuda_manual_seed(42)
+        init_num_microbatches_calculator(0, None, 1, 1, 1)
+
+        try:
+            config = TransformerConfig(
+                num_layers=num_layers,
+                hidden_size=hidden_size,
+                num_attention_heads=num_heads,
+                use_cpu_initialization=True,
+                pipeline_dtype=torch.bfloat16,
+                bf16=True,
+                pipeline_model_parallel_size=pp_size,
+                virtual_pipeline_model_parallel_size=vp_size,
+                enable_hyper_connections=enable_mhc,
+                num_residual_streams=4 if enable_mhc else 1,
+                account_for_embedding_in_pipeline_split=account_for_embedding,
+                account_for_loss_in_pipeline_split=account_for_loss,
+                hidden_dropout=0.0,
+                attention_dropout=0.0,
+            )
+
+            spec = get_gpt_layer_with_transformer_engine_spec(enable_hyper_connection=enable_mhc)
+
+            models = []
+            for i in range(vp_size or 1):
+                pre_process = mpu.is_pipeline_first_stage(ignore_virtual=False, vp_stage=i)
+                post_process = mpu.is_pipeline_last_stage(ignore_virtual=False, vp_stage=i)
+                m = (
+                    GPTModel(
+                        config=config,
+                        transformer_layer_spec=spec,
+                        vocab_size=vocab_size,
+                        max_sequence_length=seq_length,
+                        pre_process=pre_process,
+                        post_process=post_process,
+                        position_embedding_type="rope",
+                        vp_stage=i,
+                        share_embeddings_and_output_weights=False,
+                    )
+                    .bfloat16()
+                    .cuda()
+                )
+                m.model_type = ModelType.encoder_or_decoder
+                models.append(m)
+
+            if vp_size is None:
+                models = models[0]
+                model_list = [models]
+            else:
+                model_list = models
+
+            def forward_step_func(data_iterator, model):
+                tokens = torch.randint(0, vocab_size, (micro_batch_size, seq_length)).cuda()
+                position_ids = (
+                    torch.arange(seq_length).unsqueeze(0).expand(micro_batch_size, -1).cuda()
+                )
+                labels = torch.randint(0, vocab_size, (micro_batch_size, seq_length)).cuda()
+                output = model(tokens, position_ids, None, labels=labels)
+
+                def loss_func(output_tensor):
+                    loss = output_tensor.sum()
+                    return output_tensor, loss
+
+                return output, loss_func
+
+            forward_backward_func = get_forward_backward_func()
+
+            def make_iter():
+                while True:
+                    yield None
+
+            data_iters = [make_iter()] * len(model_list)
+
+            losses = forward_backward_func(
+                forward_step_func=forward_step_func,
+                data_iterator=data_iters,
+                model=model_list,
+                num_microbatches=4,
+                seq_length=seq_length,
+                micro_batch_size=micro_batch_size,
+                forward_only=True,
+            )
+            return losses
+
+        finally:
+            unset_num_microbatches_calculator()
+            Utils.destroy_model_parallel()
+
+    def test_pp2_mhc_forward(self):
+        """PP=2 + mHC forward pass should not hang."""
+        self._run_forward(pp_size=2, vp_size=None, enable_mhc=True)
+
+    def test_pp2_vpp2_mhc_forward(self):
+        """PP=2 + VPP=2 + mHC forward pass should not hang."""
+        self._run_forward(pp_size=2, vp_size=2, enable_mhc=True)
+
+    def test_pp2_mhc_standalone_embedding_forward(self):
+        """PP=2 + mHC + standalone embedding."""
+        # (8+1)/2 = 4.5 → need (num_layers+1) divisible by pp_size
+        # Use default 8 layers, won't divide evenly. Skip standalone embedding
+        # with 8 layers pp=2 as (8+1)/2 isn't integer.
+        # The test framework should raise ValueError, confirming the validation.
+        with pytest.raises((ValueError, AssertionError)):
+            self._run_forward(pp_size=2, vp_size=None, enable_mhc=True, account_for_embedding=True)
+
+    def test_pp2_mhc_standalone_both_forward(self):
+        """PP=2 + mHC + standalone embedding + loss: (8+2)/2=5, works."""
+        self._run_forward(
+            pp_size=2,
+            vp_size=None,
+            enable_mhc=True,
+            account_for_embedding=True,
+            account_for_loss=True,
+        )
+
+    def test_pp2_no_mhc_forward_baseline(self):
+        """Baseline: PP=2 without mHC should work fine."""
+        self._run_forward(pp_size=2, vp_size=None, enable_mhc=False)
+
+
+# ===========================================================================
+# 8. Flexible VPP layout (pipeline_model_parallel_layout) + mHC
+# ===========================================================================
+
+
+def _make_layout_config(
+    hidden_size=64,
+    num_layers=8,
+    pp_size=2,
+    layout=None,
+    enable_hyper_connections=False,
+    num_residual_streams=4,
+    **extra,
+):
+    """Build a TransformerConfig with a flexible VPP layout for testing.
+
+    Unlike _make_config, this uses pipeline_model_parallel_layout instead of
+    account_for_embedding/loss flags, since they are mutually exclusive.
+    """
+    kwargs = dict(
+        hidden_size=hidden_size,
+        num_layers=num_layers,
+        num_attention_heads=4,
+        pipeline_model_parallel_size=pp_size,
+        pipeline_model_parallel_layout=layout,
+        pipeline_dtype=torch.bfloat16,
+        enable_hyper_connections=enable_hyper_connections,
+        num_residual_streams=num_residual_streams,
+        use_cpu_initialization=True,
+    )
+    kwargs.update(extra)
+    return TransformerConfig(**kwargs)
+
+
+class TestFlexibleVPPLayoutLayerCountsWithMHC:
+    """
+    Verify get_num_layers_to_build returns correct layer counts when
+    flexible VPP layout (pipeline_model_parallel_layout) is combined with mHC.
+    mHC itself doesn't change layer counts, so these tests confirm the
+    combination doesn't break anything.
+    """
+
+    def setup_method(self, method):
+        pass
+
+    def teardown_method(self, method):
+        parallel_state.set_pipeline_model_parallel_world_size(None)
+        parallel_state.set_virtual_pipeline_model_parallel_world_size(None)
+
+    def test_pp2_vpp2_standalone_embed_loss_mhc(self):
+        """PP=2, VPP=2: standalone embedding & loss on separate VP stages."""
+        # Layout: [["embedding"], ["decoder"]*6, ["decoder"], ["loss"]]
+        # PP=2, VPP=2 → 4 stages:
+        #   PP0 VP0: ["embedding"]    → 0 decoders
+        #   PP1 VP0: ["decoder"]*6    → 6 decoders
+        #   PP0 VP1: ["decoder"]      → 1 decoder
+        #   PP1 VP1: ["loss"]         → 0 decoders
+        layout = [["embedding"], ["decoder"] * 6, ["decoder"], ["loss"]]
+        Utils.fake_initialize_model_parallel(
+            pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=2
+        )
+        cfg = _make_layout_config(
+            num_layers=7,
+            pp_size=2,
+            layout=layout,
+            enable_hyper_connections=True,
+            num_residual_streams=4,
+        )
+
+        expected = {(0, 0): 0, (0, 1): 1, (1, 0): 6, (1, 1): 0}
+        total = 0
+        for pp_rank in range(2):
+            parallel_state.set_pipeline_model_parallel_rank(pp_rank)
+            for vp in range(2):
+                n = get_num_layers_to_build(cfg, vp_stage=vp)
+                assert (
+                    n == expected[(pp_rank, vp)]
+                ), f"pp_rank={pp_rank}, vp={vp}: expected {expected[(pp_rank, vp)]}, got {n}"
+                total += n
+        assert total == 7
+
+    def test_pp2_vpp2_even_split_mhc(self):
+        """PP=2, VPP=2: even split with embedding/loss attached to decoder stages."""
+        # Layout: [["embedding","decoder","decoder"], ["decoder"]*4,
+        #          ["decoder"], ["decoder","loss"]]
+        # PP0 VP0: ["embedding","decoder","decoder"] → 2 decoders
+        # PP1 VP0: ["decoder"]*4                     → 4 decoders
+        # PP0 VP1: ["decoder"]                       → 1 decoder
+        # PP1 VP1: ["decoder","loss"]                → 1 decoder
+        layout = [
+            ["embedding", "decoder", "decoder"],
+            ["decoder"] * 4,
+            ["decoder"],
+            ["decoder", "loss"],
+        ]
+        Utils.fake_initialize_model_parallel(
+            pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=2
+        )
+        cfg = _make_layout_config(
+            num_layers=8, pp_size=2, layout=layout, enable_hyper_connections=True
+        )
+
+        expected = {(0, 0): 2, (0, 1): 1, (1, 0): 4, (1, 1): 1}
+        total = 0
+        for pp_rank in range(2):
+            parallel_state.set_pipeline_model_parallel_rank(pp_rank)
+            for vp in range(2):
+                n = get_num_layers_to_build(cfg, vp_stage=vp)
+                assert (
+                    n == expected[(pp_rank, vp)]
+                ), f"pp_rank={pp_rank}, vp={vp}: expected {expected[(pp_rank, vp)]}, got {n}"
+                total += n
+        assert total == 8
+
+    def test_pp2_vpp2_empty_stage_mhc(self):
+        """PP=2, VPP=2: empty VP stage (standalone embedding) with mHC."""
+        # Layout: [["embedding"], ["decoder"]*7, [], ["loss"]]
+        # PP0 VP0: ["embedding"]  → 0 decoders
+        # PP1 VP0: ["decoder"]*7  → 7 decoders
+        # PP0 VP1: []             → 0 decoders
+        # PP1 VP1: ["loss"]       → 0 decoders
+        layout = [["embedding"], ["decoder"] * 7, [], ["loss"]]
+        Utils.fake_initialize_model_parallel(
+            pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=2
+        )
+        cfg = _make_layout_config(
+            num_layers=7, pp_size=2, layout=layout, enable_hyper_connections=True
+        )
+
+        expected = {(0, 0): 0, (0, 1): 0, (1, 0): 7, (1, 1): 0}
+        for pp_rank in range(2):
+            parallel_state.set_pipeline_model_parallel_rank(pp_rank)
+            for vp in range(2):
+                n = get_num_layers_to_build(cfg, vp_stage=vp)
+                assert n == expected[(pp_rank, vp)]
+                assert n >= 0
+
+    def test_mhc_does_not_alter_layout_layer_counts(self):
+        """Same layout gives identical layer counts with and without mHC."""
+        layout = [
+            ["embedding", "decoder", "decoder"],
+            ["decoder"] * 4,
+            ["decoder"],
+            ["decoder", "loss"],
+        ]
+        Utils.fake_initialize_model_parallel(
+            pipeline_model_parallel_size=2, virtual_pipeline_model_parallel_size=2
+        )
+        cfg_mhc = _make_layout_config(
+            num_layers=8, pp_size=2, layout=layout, enable_hyper_connections=True
+        )
+        cfg_no_mhc = _make_layout_config(
+            num_layers=8, pp_size=2, layout=layout, enable_hyper_connections=False
+        )
+
+        for pp_rank in range(2):
+            parallel_state.set_pipeline_model_parallel_rank(pp_rank)
+            for vp in range(2):
+                n_mhc = get_num_layers_to_build(cfg_mhc, vp_stage=vp)
+                n_no_mhc = get_num_layers_to_build(cfg_no_mhc, vp_stage=vp)
+                assert (
+                    n_mhc == n_no_mhc
+                ), f"pp_rank={pp_rank}, vp={vp}: mHC={n_mhc} != no-mHC={n_no_mhc}"
+
+
+class TestFlexibleVPPLayoutShapeConsistencyWithMHC:
+    """
+    Verify that P2P tensor shapes are consistent (send == recv) between
+    consecutive PP stages when using flexible VPP layout + mHC.
+    This is critical: a shape mismatch causes hangs or crashes.
+    """
+
+    def test_pp2_flexible_vpp_mhc_send_recv_match(self):
+        """PP=2 with flexible VPP layout + mHC: rank 0 send == rank 1 recv."""
+        H, N = 64, 4
+        cfg = _make_layout_config(
+            hidden_size=H,
+            num_layers=7,
+            pp_size=2,
+            layout=[["embedding"], ["decoder"] * 6, ["decoder"], ["loss"]],
+            enable_hyper_connections=True,
+            num_residual_streams=N,
+        )
+        shapes = _get_send_recv_shapes(cfg, pp_size=2)
+        assert (
+            shapes[0][0] == shapes[1][1]
+        ), f"rank 0 send {shapes[0][0]} != rank 1 recv {shapes[1][1]}"
+        # rank 0 (first) sends n*C
+        assert shapes[0][0] == [(32, 2, H * N)]
+        # rank 1 (last) sends C
+        assert shapes[1][0] == [(32, 2, H)]
+
+    def test_pp4_flexible_vpp_mhc_all_consecutive_match(self):
+        """PP=4 with flexible VPP layout + mHC: send[i] == recv[i+1] for all i."""
+        H, N = 64, 4
+        layout = [
+            ["embedding"],
+            ["decoder"] * 2,
+            ["decoder"],
+            ["decoder"],
+            ["decoder"],
+            ["decoder"],
+            ["decoder"],
+            ["decoder", "loss"],
+        ]
+        cfg = _make_layout_config(
+            hidden_size=H,
+            num_layers=8,
+            pp_size=4,
+            layout=layout,
+            enable_hyper_connections=True,
+            num_residual_streams=N,
+        )
+        shapes = _get_send_recv_shapes(cfg, pp_size=4)
+        for i in range(3):
+            assert (
+                shapes[i][0] == shapes[i + 1][1]
+            ), f"rank {i} send {shapes[i][0]} != rank {i+1} recv {shapes[i+1][1]}"
+
+        # First stage sends n*C, intermediate stages send/recv n*C, last stage sends C
+        assert shapes[0][0] == [(32, 2, H * N)]
+        for i in (1, 2):
+            assert shapes[i][0] == [(32, 2, H * N)]
+            assert shapes[i][1] == [(32, 2, H * N)]
+        assert shapes[3][0] == [(32, 2, H)]
+        assert shapes[3][1] == [(32, 2, H * N)]
+
+    def test_pp2_flexible_vpp_no_mhc_baseline(self):
+        """Baseline: PP=2 with flexible VPP layout, no mHC — all shapes are C."""
+        H = 64
+        cfg = _make_layout_config(
+            hidden_size=H,
+            num_layers=7,
+            pp_size=2,
+            layout=[["embedding"], ["decoder"] * 6, ["decoder"], ["loss"]],
+            enable_hyper_connections=False,
+        )
+        shapes = _get_send_recv_shapes(cfg, pp_size=2)
+        for i in range(1):
+            assert shapes[i][0] == shapes[i + 1][1]
+            assert shapes[i][0] == [(32, 2, H)]
+
+    def test_pp4_flexible_vpp_mhc_uneven_layers_shape_consistent(self):
+        """Highly uneven layout: shapes must still match between stages."""
+        H, N = 64, 4
+        layout = [["embedding", "decoder"], ["decoder"] * 5, ["decoder"], ["decoder", "loss"]]
+        cfg = _make_layout_config(
+            hidden_size=H,
+            num_layers=8,
+            pp_size=2,
+            layout=layout,
+            enable_hyper_connections=True,
+            num_residual_streams=N,
+        )
+        shapes = _get_send_recv_shapes(cfg, pp_size=2)
+        assert (
+            shapes[0][0] == shapes[1][1]
+        ), f"rank 0 send {shapes[0][0]} != rank 1 recv {shapes[1][1]}"
diff --git a/tests/unit_tests/ssm/test_gated_delta_net.py b/tests/unit_tests/ssm/test_gated_delta_net.py
index ec4d7a86ecf..3eb02442fe9 100644
--- a/tests/unit_tests/ssm/test_gated_delta_net.py
+++ b/tests/unit_tests/ssm/test_gated_delta_net.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
+from functools import partial
 from unittest import mock
 
 import pytest
@@ -31,6 +32,10 @@
 )
 from tests.unit_tests.test_utilities import Utils
 from tests.unit_tests.transformer.test_attention import _test_parallel_attention_correctness
+from tests.unit_tests.transformer.test_multi_latent_attention import (
+    make_test_packed_seq_params,
+    make_test_packed_seq_params_with_padding,
+)
 
 try:
     import fla
@@ -201,7 +206,149 @@ def test_jit_compiled_helpers(self):
         assert g.shape == alpha.shape
         assert beta_sig.shape == beta.shape
 
+    def test_gpu_forward_thd_correctness(self):
+        if self.sp_size > 1:
+            pytest.skip("Sequence parallel is not supported for this test case.")
+
+        atol, rtol = 3e-4, 3e-4
+
+        # Input shape
+        sequence_length = 32
+        micro_batch_size = 4
+        cu_seqlens = [0, 32, 64, 96, 128]
+        # sbhd input shape: [sequence length, batch size, hidden size]
+        sub_sequence_length = sequence_length // self.cp_size
+        hidden_states_sbhd = torch.rand(
+            (sub_sequence_length, micro_batch_size, self.gdn.config.hidden_size)
+        )
+        attention_mask_sbhd = None
+        hidden_states_sbhd = hidden_states_sbhd.cuda().bfloat16()
+        # thd input shape: [sequence length * batch size, 1, hidden size]
+        hidden_states_thd = hidden_states_sbhd.transpose(0, 1).contiguous()
+        hidden_states_thd = hidden_states_thd.view(-1, 1, self.gdn.config.hidden_size)
+        attention_mask_thd = None
+        packed_seq_params = make_test_packed_seq_params(cu_seqlens=cu_seqlens)
+
+        # THD format
+        output_thd, _ = self.gdn(
+            hidden_states_thd, attention_mask_thd, packed_seq_params=packed_seq_params
+        )
+        # SBHD format
+        output_sbhd, _ = self.gdn(hidden_states_sbhd, attention_mask_sbhd)
+        output_sbhd_T = output_sbhd.transpose(0, 1).contiguous().view(*output_thd.shape)
+
+        rank = torch.distributed.get_rank()
+        assert output_thd.shape[0] == sub_sequence_length * micro_batch_size
+        assert output_thd.shape[1] == 1
+        assert output_thd.shape[2] == self.gdn.config.hidden_size
+        torch.testing.assert_close(
+            output_sbhd_T,
+            output_thd,
+            atol=atol,
+            rtol=rtol,
+            msg=lambda msg: f"Output mismatch ({rank=}): {msg}",
+        )
+
+    def test_gpu_forward_thd_padding_correctness(self):
+        if self.sp_size > 1:
+            pytest.skip("Sequence parallel is not supported for this test case.")
+
+        atol, rtol = 3e-4, 3e-4
+        sequence_length = 32
+        micro_batch_size = 4
+
+        # sbhd input shape: [sequence length, batch size, hidden size]
+        sub_sequence_length = sequence_length // self.cp_size
+        hidden_states_sbhd = torch.rand(
+            (sub_sequence_length, micro_batch_size, self.gdn.config.hidden_size),
+            device=torch.cuda.current_device(),
+            dtype=torch.bfloat16,
+        )
+        output_sbhd, _ = self.gdn(hidden_states_sbhd, None)
+
+        # thd input shape: [sequence length * batch size, 1, hidden size]
+        hidden_states_thd = hidden_states_sbhd.transpose(0, 1).contiguous()
+        hidden_states_thd = hidden_states_thd.view(-1, 1, self.gdn.config.hidden_size)
+        output_bshd = output_sbhd.transpose(0, 1).contiguous()
+
+        rank = torch.distributed.get_rank()
+
+        # A) padded branch: prefer *_padded when available.
+        padded_params = make_test_packed_seq_params_with_padding(
+            cu_seqlens=[0, 30, 60, 90, 120], cu_seqlens_padded=[0, 32, 64, 96, 128]
+        )
+        output_thd_padded, _ = self.gdn(hidden_states_thd, None, packed_seq_params=padded_params)
+        output_thd2bshd = output_thd_padded.view(*output_bshd.shape)
+        torch.testing.assert_close(
+            output_bshd[..., :30],
+            output_thd2bshd[..., :30],
+            atol=atol,
+            rtol=rtol,
+            msg=lambda msg: f"THD padded output mismatch ({rank=}): {msg}",
+        )
+
+        # B) no-padded branch: use actual cu_seqlens when it matches total_sequence_length.
+        no_padding_params = make_test_packed_seq_params(cu_seqlens=[0, 32, 64, 96, 128])
+        output_thd_no_padding, _ = self.gdn(
+            hidden_states_thd, None, packed_seq_params=no_padding_params
+        )
+        assert output_thd_no_padding.shape == output_thd_padded.shape
+
+        # C) padded mismatch branch: if *_padded[-1] mismatches total_sequence_length, should raise.
+        padded_mismatch_params = make_test_packed_seq_params_with_padding(
+            cu_seqlens=[0, 30, 60, 90, 120], cu_seqlens_padded=[0, 32, 64, 96, 126]
+        )
+        with pytest.raises(ValueError, match="does not match"):
+            self.gdn(hidden_states_thd, None, packed_seq_params=padded_mismatch_params)
+
+        # D) actual mismatch branch without *_padded: should raise.
+        actual_mismatch_params = make_test_packed_seq_params(cu_seqlens=[0, 32, 64, 96, 129])
+        with pytest.raises(ValueError, match="does not match"):
+            self.gdn(hidden_states_thd, None, packed_seq_params=actual_mismatch_params)
+
+
+@pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.")
+@pytest.mark.internal
+class TestGDNCuSeqlensResolve:
+
+    @pytest.fixture
+    def mock_gdn(self):
+        class MockGDN:
+            cp_size = 2
+            _resolve_cu_seqlens = GatedDeltaNet._resolve_cu_seqlens
+
+        return MockGDN()
+
+    def test_padded_preferred_when_available(self, mock_gdn):
+        actual = torch.tensor([0, 500, 1000], dtype=torch.int32)
+        padded = torch.tensor([0, 504, 1008], dtype=torch.int32)
+        result = mock_gdn._resolve_cu_seqlens(padded, actual, 1008, "cu_seqlens_q")
+        assert torch.equal(result, padded)
+
+    def test_actual_used_when_no_padding(self, mock_gdn):
+        actual = torch.tensor([0, 504, 1008], dtype=torch.int32)
+        result = mock_gdn._resolve_cu_seqlens(None, actual, 1008, "cu_seqlens_q")
+        assert torch.equal(result, actual)
+
+    def test_raises_when_padding_mismatch(self, mock_gdn):
+        actual = torch.tensor([0, 500, 1000], dtype=torch.int32)
+        with pytest.raises(ValueError, match="does not match"):
+            mock_gdn._resolve_cu_seqlens(None, actual, 1008, "cu_seqlens_q")
+
+    def test_raises_when_padded_mismatches_total(self, mock_gdn):
+        actual = torch.tensor([0, 500, 1000], dtype=torch.int32)
+        padded = torch.tensor([0, 504, 1004], dtype=torch.int32)
+        with pytest.raises(ValueError, match="does not match"):
+            mock_gdn._resolve_cu_seqlens(padded, actual, 1008, "cu_seqlens_q")
+
+    def test_cp1_still_validates_total(self, mock_gdn):
+        mock_gdn.cp_size = 1
+        actual = torch.tensor([0, 500, 1000], dtype=torch.int32)
+        with pytest.raises(ValueError, match="does not match"):
+            mock_gdn._resolve_cu_seqlens(None, actual, 1008, "cu_seqlens_q")
+
 
+@pytest.mark.parametrize("sequence_packing", [False, True])
 @pytest.mark.parametrize(
     ("tp", "sp", "cp"),
     [
@@ -213,7 +360,7 @@ def test_jit_compiled_helpers(self):
     ],
 )
 @pytest.mark.skipif(not HAVE_FLA, reason="FLA is not installed.")
-def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp):
+def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, sequence_packing, tp, sp, cp):
     transformer_config = TransformerConfig(
         hidden_size=128,
         linear_conv_kernel_dim=2,
@@ -254,4 +401,5 @@ def test_parallel_gated_delta_net_correctness(tmp_path_dist_ckpt, tp, sp, cp):
         seed=123,
         sequence_length=256,
         micro_batch_size=4,
+        sequence_packing=sequence_packing,
     )
diff --git a/tests/unit_tests/test_fp8_param.py b/tests/unit_tests/test_fp8_param.py
index 34b504e21de..e0a71526297 100644
--- a/tests/unit_tests/test_fp8_param.py
+++ b/tests/unit_tests/test_fp8_param.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import contextlib
 import gc
@@ -72,12 +72,12 @@ def setup_method(self, method):
         os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
 
     def teardown_method(self, method):
-        Utils.destroy_model_parallel()
-        destroy_global_vars()
-        destroy_num_microbatches_calculator()
         if self.cuda_graph_helper is not None and self.cuda_graph_helper.graphs_created():
             self.cuda_graph_helper.delete_cuda_graphs()
             self.cuda_graph_helper = None
+        Utils.destroy_model_parallel()
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
         gc.collect()
 
     def model_provider(
diff --git a/tests/unit_tests/test_inference.py b/tests/unit_tests/test_inference.py
index 8b3a4a64da4..9474ac0475a 100644
--- a/tests/unit_tests/test_inference.py
+++ b/tests/unit_tests/test_inference.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import argparse
 import unittest.mock
diff --git a/tests/unit_tests/test_optimizer_state_offloading.py b/tests/unit_tests/test_optimizer_state_offloading.py
new file mode 100644
index 00000000000..baaab355182
--- /dev/null
+++ b/tests/unit_tests/test_optimizer_state_offloading.py
@@ -0,0 +1,337 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+"""Unit tests for OptimizerStateOffloader."""
+
+import pytest
+import torch
+import torch.nn as nn
+
+from megatron.core.distributed import DistributedDataParallel, DistributedDataParallelConfig
+from megatron.core.optimizer import OptimizerConfig, get_megatron_optimizer
+from megatron.core.transformer import TransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    from transformer_engine.pytorch.optimizers import FusedAdam  # noqa: F401
+
+    TE_FUSED_ADAM_AVAILABLE = True
+except ImportError:
+    TE_FUSED_ADAM_AVAILABLE = False
+
+
+class SimpleModel(nn.Module):
+    """Simple model for testing."""
+
+    def __init__(self, hidden_size=256):
+        super().__init__()
+        self.fc1 = nn.Linear(hidden_size, hidden_size)
+        self.fc2 = nn.Linear(hidden_size, hidden_size)
+
+    def forward(self, x):
+        return self.fc2(torch.relu(self.fc1(x)))
+
+
+def create_model_and_optimizer(hidden_size=256, offload_optimizer_states=True, **optimizer_kwargs):
+    """Helper to create model and optimizer for tests."""
+    model = SimpleModel(hidden_size=hidden_size).bfloat16().cuda()
+    ddp_config = DistributedDataParallelConfig(use_distributed_optimizer=True)
+    model = DistributedDataParallel(
+        TransformerConfig(num_attention_heads=1, num_layers=1), ddp_config, model
+    )
+
+    default_config = dict(
+        optimizer='adam',
+        bf16=True,
+        lr=0.001,
+        use_distributed_optimizer=True,
+        offload_optimizer_states=offload_optimizer_states,
+    )
+    default_config.update(optimizer_kwargs)
+
+    optimizer_config = OptimizerConfig(**default_config)
+    optim = get_megatron_optimizer(optimizer_config, [model])
+    return model, optim
+
+
+def run_forward_backward_step(model, optim, hidden_size=256):
+    """Run a single forward-backward-step cycle."""
+    input_tensor = torch.randn(8, hidden_size, dtype=torch.bfloat16, device='cuda')
+    output = model(input_tensor)
+    output.sum().backward()
+    optim.step()
+    optim.zero_grad()
+
+
+# =============================================================================
+# Test 1: Basic OptimizerStateOffloader Initialization
+# =============================================================================
+@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam")
+def test_offloader_initialization():
+    """Test that OptimizerStateOffloader initializes correctly."""
+    Utils.initialize_model_parallel()
+    model, optim = create_model_and_optimizer()
+    dist_optim = optim.chained_optimizers[0]
+
+    # Offloader is created in __init__ when offload_optimizer_states=True
+    assert dist_optim._state_offloader is not None
+    offloader = dist_optim._state_offloader
+
+    # Verify offloader properties
+    assert offloader.adam_optimizer is not None
+    assert offloader._d2h_stream is not None
+    assert offloader._h2d_stream is not None
+    assert offloader._offloaded is False
+
+    # Before first step, optimizer states are not initialized yet
+    assert offloader._optimizer_states_initialized is False
+
+    # Run one step to initialize optimizer states
+    run_forward_backward_step(model, optim)
+
+    # After first step, optimizer states should be marked as initialized
+    assert offloader._optimizer_states_initialized is True
+    Utils.destroy_model_parallel()
+
+
+# =============================================================================
+# Test 2: Early Master Weight Offloading Before First Step
+# =============================================================================
+@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam")
+def test_early_master_weight_offloading():
+    """Test that master weights can be offloaded before the first optimizer step."""
+    Utils.initialize_model_parallel()
+    model, optim = create_model_and_optimizer()
+    dist_optim = optim.chained_optimizers[0]
+
+    # Offloader is created in __init__
+    assert dist_optim._state_offloader is not None
+    offloader = dist_optim._state_offloader
+
+    # Before first step, optimizer states are not initialized
+    assert offloader._optimizer_states_initialized is False
+
+    # Capture original master weights before offload
+    original_master_weights = []
+    for group in dist_optim.shard_fp32_from_float16_groups:
+        group_weights = [tensor.clone() for tensor in group]
+        original_master_weights.append(group_weights)
+
+    # Offload before first step - should only offload master weights
+    offloader.offload()
+    offloader.release_gpu_memory()
+    torch.cuda.synchronize()
+
+    # Verify master weights were offloaded (storage resized to 0)
+    for group in dist_optim.shard_fp32_from_float16_groups:
+        for tensor in group:
+            assert tensor.untyped_storage().size() == 0, "Master weight should be offloaded"
+
+    # Reload master weights
+    offloader.reload()
+    offloader.sync_before_step()
+
+    # Verify master weights match after reload
+    for group_idx, group in enumerate(dist_optim.shard_fp32_from_float16_groups):
+        for param_idx, tensor in enumerate(group):
+            original = original_master_weights[group_idx][param_idx]
+            torch.testing.assert_close(
+                tensor,
+                original,
+                msg=f"Master weight [{group_idx}][{param_idx}] mismatch after offload/reload",
+            )
+
+    # Now run a step and verify optimizer states can be offloaded after
+    run_forward_backward_step(model, optim)
+    assert offloader._optimizer_states_initialized is True
+
+    Utils.destroy_model_parallel()
+
+
+# =============================================================================
+# Test 3: Offload and Reload Correctness
+# =============================================================================
+@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam")
+@pytest.mark.parametrize("offload_optimizer_states", [True, False])
+@pytest.mark.parametrize("offload_master_weights", [True, False])
+def test_offload_reload_correctness(offload_optimizer_states, offload_master_weights):
+    """Test that offload/reload preserves optimizer state values."""
+    if not offload_optimizer_states and not offload_master_weights:
+        pytest.skip("At least one offload type required")
+
+    Utils.initialize_model_parallel()
+    model, optim = create_model_and_optimizer()
+    dist_optim = optim.chained_optimizers[0]
+
+    # Run steps to build up optimizer state
+    for _ in range(3):
+        run_forward_backward_step(model, optim)
+
+    offloader = dist_optim._state_offloader
+
+    # Capture original states before offload
+    original_states = {}
+    for param, state in offloader.adam_optimizer.state.items():
+        original_states[param] = {
+            k: v.clone() for k, v in state.items() if isinstance(v, torch.Tensor)
+        }
+
+    # Offload
+    offloader.offload(
+        offload_optimizer_states=offload_optimizer_states,
+        offload_master_weights=offload_master_weights,
+    )
+
+    # Release GPU memory
+    offloader.release_gpu_memory()
+    torch.cuda.synchronize()
+
+    # Reload
+    offloader.reload()
+    offloader.sync_before_step()
+
+    # Verify states match after reload
+    for param, state in offloader.adam_optimizer.state.items():
+        if param in original_states:
+            for key, original_tensor in original_states[param].items():
+                if key in state and isinstance(state[key], torch.Tensor):
+                    reloaded_tensor = state[key]
+                    assert reloaded_tensor.device.type == 'cuda', f"State {key} should be on GPU"
+                    torch.testing.assert_close(
+                        reloaded_tensor,
+                        original_tensor,
+                        msg=f"State {key} mismatch after offload/reload",
+                    )
+    Utils.destroy_model_parallel()
+
+
+# =============================================================================
+# Test 4: GPU Memory Release Verification
+# =============================================================================
+@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam")
+def test_gpu_memory_release():
+    """Test that GPU memory is actually freed after release_gpu_memory()."""
+    Utils.initialize_model_parallel()
+    # Use larger model for measurable memory impact
+    model, optim = create_model_and_optimizer(hidden_size=1024)
+    dist_optim = optim.chained_optimizers[0]
+
+    # Initialize optimizer states
+    run_forward_backward_step(model, optim, hidden_size=1024)
+
+    offloader = dist_optim._state_offloader
+
+    # Measure memory before offload
+    torch.cuda.synchronize()
+    torch.cuda.empty_cache()
+    memory_before = torch.cuda.memory_allocated()
+
+    # Offload and release
+    offloader.offload()
+    offloader.release_gpu_memory()
+
+    # Wait for async operations
+    torch.cuda.synchronize()
+    torch.cuda.empty_cache()
+    memory_after = torch.cuda.memory_allocated()
+
+    # Memory should decrease
+    memory_freed = memory_before - memory_after
+    assert memory_freed > 0, f"Expected memory to be freed, but got {memory_freed} bytes difference"
+    Utils.destroy_model_parallel()
+
+
+# =============================================================================
+# Test 5: Multiple Offload/Reload Cycles
+# =============================================================================
+@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam")
+def test_multiple_offload_reload_cycles():
+    """Test that multiple offload/reload cycles work correctly."""
+    Utils.initialize_model_parallel()
+    model, optim = create_model_and_optimizer()
+    dist_optim = optim.chained_optimizers[0]
+
+    # Initialize
+    run_forward_backward_step(model, optim)
+
+    offloader = dist_optim._state_offloader
+
+    # Run multiple cycles
+    for cycle in range(5):
+        # Offload
+        offloader.offload()
+        offloader.release_gpu_memory()
+
+        # Reload
+        offloader.reload()
+        offloader.sync_before_step()
+
+        # Run optimizer step
+        run_forward_backward_step(model, optim)
+
+    # Verify model can still produce valid outputs
+    input_tensor = torch.randn(8, 256, dtype=torch.bfloat16, device='cuda')
+    output = model(input_tensor)
+    assert not output.isnan().any(), "Model output contains NaN after multiple cycles"
+    Utils.destroy_model_parallel()
+
+
+# =============================================================================
+# Test 6: Training Correctness with Offloading
+# =============================================================================
+@pytest.mark.skipif(not TE_FUSED_ADAM_AVAILABLE, reason="Requires TE FusedAdam")
+def test_training_correctness_with_offloading():
+    """Test that training with offloading produces same results as without."""
+    Utils.initialize_model_parallel()
+    torch.manual_seed(42)
+
+    # Model 1: with offloading
+    model1, optim1 = create_model_and_optimizer(offload_optimizer_states=True, lr=0.01)
+
+    # Model 2: without offloading (reference)
+    torch.manual_seed(42)
+    model2, optim2 = create_model_and_optimizer(offload_optimizer_states=False, lr=0.01)
+
+    # Train both models
+    n_steps = 10
+    torch.manual_seed(123)
+    dist_optim1 = optim1.chained_optimizers[0]
+
+    # Offloader is created in __init__ when offload_optimizer_states=True
+    assert dist_optim1._state_offloader is not None
+    offloader = dist_optim1._state_offloader
+
+    for step in range(n_steps):
+        input_tensor = torch.randn(8, 256, dtype=torch.bfloat16, device='cuda')
+
+        # Model 1 with offloading
+        # Offload states (master weights can be offloaded from the start,
+        # optimizer states will be skipped until after first step)
+        offloader.offload()
+        offloader.release_gpu_memory()
+
+        output1 = model1(input_tensor)
+        loss1 = output1.sum()
+        loss1.backward()
+
+        offloader.reload()
+        offloader.sync_before_step()
+        optim1.step()
+        optim1.zero_grad()
+
+        # Model 2 without offloading
+        output2 = model2(input_tensor)
+        loss2 = output2.sum()
+        loss2.backward()
+        optim2.step()
+        optim2.zero_grad()
+
+    # Compare final model weights
+    for (n1, p1), (n2, p2) in zip(model1.named_parameters(), model2.named_parameters()):
+        torch.testing.assert_close(
+            p1.data,
+            p2.data,
+            atol=1e-5,
+            rtol=1e-4,
+            msg=f"Parameter {n1} mismatch between offloaded and non-offloaded training",
+        )
+    Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/test_parallel_state.py b/tests/unit_tests/test_parallel_state.py
index 65b0d5ca91a..64c3a5b36a6 100644
--- a/tests/unit_tests/test_parallel_state.py
+++ b/tests/unit_tests/test_parallel_state.py
@@ -508,9 +508,9 @@ def golden_rank_result_from_past_code(
     "world_size, tp_size, cp_size, dp_size",
     [(8, 1, 2, 4), (8, 1, 1, 8)],  # 8 GPUs, 1 TP, 2 CP, 4 DP  # 8 GPUs, 1 TP, 1 CP, 8 DP
 )
-def test_hybrid_dp_cp_groups(world_size, tp_size, cp_size, dp_size):
+def test_dynamic_dp_cp_groups(world_size, tp_size, cp_size, dp_size):
     """
-    Test that hybrid DPxCP groups are created correctly.
+    Test that dynamic DPxCP groups are created correctly.
     """
     Utils.destroy_model_parallel()
 
@@ -521,13 +521,13 @@ def test_hybrid_dp_cp_groups(world_size, tp_size, cp_size, dp_size):
     Utils.initialize_model_parallel(
         tensor_model_parallel_size=tp_size,
         context_parallel_size=cp_size,
-        hybrid_context_parallel=True,
+        dynamic_context_parallel=True,
     )
 
     dp_cp_size = ps.get_data_parallel_world_size(with_context_parallel=True)
-    group_sizes = [2**i for i in range(int(log2(dp_cp_size)))][1:]
+    group_sizes = [2**i for i in range(int(log2(dp_cp_size)))]
     for group_size in group_sizes:
-        group = ps.get_hybrid_data_context_parallel_groups(group_size=group_size)
+        group = ps.get_dynamic_data_context_parallel_groups(group_size=group_size)
         assert group.size() == group_size
 
     Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/test_sequence_packing.py b/tests/unit_tests/test_sequence_packing.py
new file mode 100644
index 00000000000..d594b46b373
--- /dev/null
+++ b/tests/unit_tests/test_sequence_packing.py
@@ -0,0 +1,565 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import random
+from types import SimpleNamespace
+
+import numpy as np
+import pytest
+import torch
+
+from megatron.core import parallel_state
+from megatron.core.datasets.data_schedule import (
+    get_batch_on_this_rank_for_sequence_packing,
+    wrap_data_iterator,
+)
+from megatron.core.rerun_state_machine import RerunDataIterator
+from megatron.training.global_vars import unset_global_variables
+from tests.unit_tests.test_utilities import Utils
+
+
+class MockVariableLengthSequencePackingDataIterator:
+    """
+    Mock data iterator for testing get_batch_on_this_rank_for_sequence_packing.
+
+    Generates variable-length (THD format) packed sequences with deterministic
+    data for verification across parallel ranks.
+    """
+
+    def __init__(
+        self,
+        total_seq_length: int,
+        sequence_lengths: list,
+        local_cp_size: int = None,
+        device: str = "cuda",
+        seed: int = 42,
+    ):
+        """
+        Args:
+            total_seq_length: Total length of packed sequences
+            sequence_lengths: List of individual sequence lengths (variable-length).
+                              If None, generates random variable lengths.
+            device: Device to create tensors on
+            seed: Random seed for reproducibility
+        """
+        self.total_seq_length = total_seq_length
+        self.sequence_lengths = sequence_lengths
+        self.local_cp_size = local_cp_size
+        self.device = device
+        self.seed = seed
+        assert (
+            sum(self.sequence_lengths) == total_seq_length
+        ), f"Sequence lengths sum {sum(self.sequence_lengths)} != total {total_seq_length}"
+
+    def __iter__(self):
+        """Interface for the data iterator."""
+        return self
+
+    def __next__(self):
+        """Generate a mock batch with variable-length THD format."""
+        dev = self.device
+        torch.manual_seed(self.seed)
+        torch.cuda.manual_seed(self.seed)
+
+        tokens = torch.randint(0, 16384, (self.total_seq_length,), dtype=torch.int64, device=dev)
+
+        # Create position_ids that reset for each sequence (THD format)
+        position_ids = []
+        for seq_len in self.sequence_lengths:
+            position_ids.extend(range(seq_len))
+        position_ids = torch.tensor(position_ids, dtype=torch.int64, device=dev)
+
+        # Labels are tokens shifted by 1 for easy verification
+        labels = tokens + 1
+
+        # Loss mask: 1.0 for all positions except padding (none here)
+        loss_mask = torch.ones(self.total_seq_length, dtype=torch.float32, device=dev)
+
+        # Create cu_seqlens for variable-length packed sequences
+        cu_seqlens = [0]
+        for seq_len in self.sequence_lengths:
+            cu_seqlens.append(cu_seqlens[-1] + seq_len)
+        cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32, device=dev)
+        cu_seqlens_padded = cu_seqlens.clone()
+
+        max_seqlen = torch.tensor([max(self.sequence_lengths)], dtype=torch.int32, device=dev)
+
+        batch = {
+            "tokens": tokens,
+            "position_ids": position_ids,
+            "labels": labels,
+            "loss_mask": loss_mask,
+            "cu_seqlens": cu_seqlens,
+            "cu_seqlens_padded": cu_seqlens_padded,
+            "max_seqlen": max_seqlen,
+        }
+
+        if not (
+            parallel_state.is_pipeline_first_stage(ignore_virtual=True)
+            or parallel_state.is_pipeline_last_stage(ignore_virtual=True)
+        ):
+            batch["tokens"] = None
+            batch["position_ids"] = None
+            batch["labels"] = None
+            batch["loss_mask"] = None
+
+        if self.local_cp_size is not None:
+            batch["local_cp_size"] = torch.tensor(
+                [self.local_cp_size], dtype=torch.int32, device=dev
+            )
+
+        return batch
+
+
+def _gather_tensor_from_tp_group(tensor):
+    """Gather tensors from all TP ranks for comparison."""
+    assert tensor is not None, "Tensor should not be None"
+    if type(tensor) is int:
+        tensor = torch.tensor(tensor, dtype=torch.int32, device=torch.cuda.current_device())
+    tp_size = parallel_state.get_tensor_model_parallel_world_size()
+    gathered = [torch.zeros_like(tensor) for _ in range(tp_size)]
+    torch.distributed.all_gather(
+        gathered, tensor, group=parallel_state.get_tensor_model_parallel_group()
+    )
+    return gathered
+
+
+def _gather_tensor_from_all_ranks(tensor):
+    """Gather tensors from all PP ranks for comparison."""
+    assert tensor is not None, "Tensor should not be None"
+    if type(tensor) is int:
+        tensor = torch.tensor(tensor, dtype=torch.int32, device=torch.cuda.current_device())
+    gathered = [torch.zeros_like(tensor) for _ in range(torch.distributed.get_world_size())]
+    torch.distributed.all_gather(gathered, tensor)
+    return gathered
+
+
+@pytest.mark.parametrize(
+    ("tp", "pp", "cp", "dynamic_cp", "local_cp_size"),
+    [
+        (1, 1, 1, False, None),  # Basic case: no parallelism
+        (2, 1, 1, False, None),  # Tensor parallel only
+        (1, 2, 1, False, None),  # Pipeline parallel only
+        (2, 2, 1, False, None),  # TP + PP
+        (1, 1, 2, False, None),  # CP only
+        (2, 1, 2, False, None),  # TP + CP
+        (1, 2, 2, False, None),  # PP + CP
+        (1, 4, 1, False, None),  # Has middle pp stage
+        (1, 1, 4, True, 4),  # DCP: all CP ranks participate
+        (1, 1, 4, True, 2),  # DCP: partial CP (2 out of 4)
+        (1, 1, 4, True, 1),  # DCP: no CP splitting
+        (2, 1, 4, True, 4),  # DCP + TP
+        (1, 2, 4, True, 4),  # DCP + PP
+    ],
+)
+def test_get_batch_on_this_rank_for_sequence_packing(tp, pp, cp, dynamic_cp, local_cp_size):
+    """
+    Test get_batch_on_this_rank_for_sequence_packing function with variable-length THD format.
+
+    This test verifies:
+    1. TP ranks: All ranks within a TP group receive identical data after broadcast
+    2. PP ranks: Middle PP ranks have the same packed_seq_params as first/last stages
+    3. CP ranks: Data is correctly partitioned with proper shape and values
+    4. Variable-length (THD) format: Different sequence lengths are handled correctly
+    """
+    args = SimpleNamespace()
+    args.tensor_model_parallel_size = tp
+    args.pipeline_model_parallel_size = pp
+    args.context_parallel_size = cp
+    args.virtual_pipeline_model_parallel_size = None
+    args.data_parallel_size = 8 // (tp * pp * cp)
+    args.seq_length = 8192
+
+    # Skip invalid configurations
+    if args.data_parallel_size < 1:
+        raise ValueError(f"Invalid config: tp={tp}, pp={pp}, cp={cp} exceeds world size 8")
+
+    # Initialize model parallel
+    init_kwargs = dict(context_parallel_size=cp)
+    if dynamic_cp:
+        init_kwargs['dynamic_context_parallel'] = True
+        init_kwargs['min_dynamic_context_parallel_size'] = 1
+    Utils.initialize_model_parallel(tp, pp, None, **init_kwargs)
+
+    try:
+        # Create mock data iterator with variable-length sequences
+        # Only TP rank 0 needs the iterator; other TP ranks pass None
+        tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        if tp_rank == 0:
+            # Use deterministic seed based on DP rank so same data within TP/PP/CP group
+            dp_rank = parallel_state.get_data_parallel_rank()
+            sequence_lengths = [1024, 2048, 512, 1536, 3072]
+            assert (
+                sum(sequence_lengths) == args.seq_length
+            ), f"Sequence lengths sum {sum(sequence_lengths)} != total {args.seq_length}"
+            data_iterator = iter(
+                MockVariableLengthSequencePackingDataIterator(
+                    total_seq_length=args.seq_length,
+                    sequence_lengths=sequence_lengths,
+                    local_cp_size=local_cp_size,
+                    seed=42 + dp_rank,
+                )
+            )
+        else:
+            data_iterator = None
+
+        # Call the function under test
+        result = get_batch_on_this_rank_for_sequence_packing(
+            data_iterator=data_iterator,
+            mtp_on_this_rank=False,
+            vp_stage=None,
+            dynamic_cp=dynamic_cp,
+        )
+
+        # Unpack the result
+        tokens, labels, loss_mask, attention_mask, position_ids, packed_seq_params = result
+
+        # Get parallel state info
+        tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        pp_rank = parallel_state.get_pipeline_model_parallel_rank()
+        is_first_stage = parallel_state.is_pipeline_first_stage(ignore_virtual=True)
+        is_last_stage = parallel_state.is_pipeline_last_stage(ignore_virtual=True)
+        is_first_or_last = is_first_stage or is_last_stage
+
+        # =====================================================================
+        # TEST 1: Verify data based on pipeline stage
+        # =====================================================================
+        if is_first_stage:
+            assert tokens is not None, "First stage should have tokens"
+            assert position_ids is not None, "First stage should have position_ids"
+            assert tokens.dim() == 2, "Tokens should be 2D (batch, seq)"
+            assert position_ids.dim() == 2, "Position IDs should be 2D (batch, seq)"
+            assert tokens.size(0) == 1, "batch should be 1 in THD format"
+            assert position_ids.size(0) == 1, "batch should be 1 in THD format"
+        else:
+            assert tokens is None, "Non-first stage should not have tokens"
+            assert position_ids is None, "Non-first stage should not have position_ids"
+
+        if is_last_stage:
+            assert labels is not None, "Last stage should have labels"
+            assert loss_mask is not None, "Last stage should have loss_mask"
+            assert labels.dim() == 2, "Labels should be 2D (batch, seq)"
+            assert loss_mask.dim() == 2, "Loss mask should be 2D (batch, seq)"
+            assert labels.size(0) == 1, "batch should be 1 in THD format"
+            assert loss_mask.size(0) == 1, "batch should be 1 in THD format"
+        else:
+            assert labels is None, "Non-last stage should not have labels"
+            assert loss_mask is None, "Non-last stage should not have loss_mask"
+
+        # =====================================================================
+        # TEST 2: Verify packed_seq_params consistency
+        # =====================================================================
+        assert packed_seq_params is not None
+        assert packed_seq_params.qkv_format == "thd"
+
+        test_keys = [
+            "cu_seqlens_q",
+            "cu_seqlens_q_padded",
+            "max_seqlen_q",
+            "cu_seqlens_kv",
+            "cu_seqlens_kv_padded",
+            "max_seqlen_kv",
+        ]
+
+        if dynamic_cp:
+            assert packed_seq_params.local_cp_size == local_cp_size
+            # For DCP, only TP ranks within the same CP group should match.
+            # Different CP groups can have different packed_seq_params.
+            if tp > 1:
+                for key in test_keys:
+                    tensor = getattr(packed_seq_params, key)
+                    assert tensor is not None
+                    gathered = _gather_tensor_from_tp_group(tensor)
+                    for i in range(1, tp):
+                        assert torch.equal(
+                            gathered[0], gathered[i]
+                        ), f"TP rank 0 and rank {i} have different {key}"
+        else:
+            # For THD, all ranks share the same packing metadata.
+            for key in test_keys:
+                tensor = getattr(packed_seq_params, key)
+                assert tensor is not None
+                gathered_tensor = _gather_tensor_from_all_ranks(tensor)
+                for i in range(1, len(gathered_tensor)):
+                    assert torch.equal(
+                        gathered_tensor[0], gathered_tensor[i]
+                    ), f"Rank 0 and rank {i} have different {key}"
+
+        # =====================================================================
+        # TEST 3: Verify TP ranks receive identical data after broadcast
+        # =====================================================================
+        if tp > 1:
+            test_tensors = []
+            if is_first_stage:
+                test_tensors.extend([tokens, position_ids])
+            if is_last_stage:
+                test_tensors.extend([labels, loss_mask])
+
+            for tensor in test_tensors:
+                gathered_tensors = _gather_tensor_from_tp_group(tensor)
+                for i in range(1, tp):
+                    assert torch.equal(
+                        gathered_tensors[0], gathered_tensors[i]
+                    ), f"TP rank 0 and rank {i} have different data"
+
+        # =====================================================================
+        # TEST 4: Verify CP partitioning
+        # =====================================================================
+        effective_cp = local_cp_size if dynamic_cp else cp
+        if effective_cp is not None and effective_cp > 1:
+            expected_seq_len = args.seq_length // effective_cp
+
+            if is_first_stage:
+                actual_seq_len = tokens.shape[1]
+                assert (
+                    actual_seq_len == expected_seq_len
+                ), f"CP partitioned tokens have wrong shape: {actual_seq_len} != {expected_seq_len}"
+
+            if is_last_stage:
+                actual_seq_len = labels.shape[1]
+                assert (
+                    actual_seq_len == expected_seq_len
+                ), f"CP partitioned labels have wrong shape: {actual_seq_len} != {expected_seq_len}"
+
+    finally:
+        Utils.destroy_model_parallel()
+        unset_global_variables()
+
+
+@pytest.mark.parametrize(
+    ("tp", "pp", "cp", "vpp", "scheduler_type"),
+    [
+        (1, 1, 8, None, "dp_balanced"),
+        (2, 1, 4, None, "dp_balanced"),
+        (2, 4, 1, None, "dp_balanced"),
+        (2, 2, 1, None, "dp_balanced"),
+        (1, 4, 1, 4, "dp_balanced"),
+        (1, 1, 8, None, "default_dynamic_cp"),
+        (2, 1, 4, None, "default_dynamic_cp"),
+        (1, 2, 4, None, "default_dynamic_cp"),
+        (1, 4, 2, 4, "default_dynamic_cp"),
+    ],
+)
+def test_wrap_dataloader(tp, pp, cp, vpp, scheduler_type):
+    '''
+    Test wrap_dataloader function with different scheduler types.
+    '''
+    is_dynamic_cp = scheduler_type == "default_dynamic_cp"
+
+    args = SimpleNamespace()
+    args.tensor_model_parallel_size = tp
+    args.pipeline_model_parallel_size = pp
+    args.context_parallel_size = cp
+    args.virtual_pipeline_model_parallel_size = None
+    args.data_parallel_size = 8 // (tp * pp * cp)
+    args.seq_length = 8192
+    args.max_seqlen_per_dp_cp_rank = 8192
+
+    # Skip invalid configurations
+    if args.data_parallel_size < 1:
+        raise ValueError(f"Invalid config: tp={tp}, pp={pp}, cp={cp} exceeds world size 8")
+
+    def _create_single_sample(seq_len):
+        # hard code the padding size to 16
+        pad_size = 16
+        seq_len_padded = ((seq_len + pad_size - 1) // pad_size) * pad_size
+        device = torch.device("cuda", torch.cuda.current_device())
+        tokens = torch.randint(0, 128, (seq_len_padded,), dtype=torch.int64, device=device)
+        labels = tokens + 1
+        position_ids = torch.arange(seq_len_padded, dtype=torch.int64, device=device)
+        loss_mask = torch.ones(seq_len_padded, dtype=torch.float32, device=device)
+        loss_mask[0:seq_len] = 1
+        loss_mask[seq_len:] = 0
+        cu_seqlens = torch.tensor([0, seq_len_padded], dtype=torch.int32, device=device)
+
+        return {
+            'tokens': tokens,
+            'labels': labels,
+            'loss_mask': loss_mask,
+            'position_ids': position_ids,
+            'cu_seqlens': cu_seqlens,
+        }
+
+    # Initialize model parallel
+    init_kwargs = dict(context_parallel_size=cp)
+    if is_dynamic_cp:
+        init_kwargs['dynamic_context_parallel'] = True
+        init_kwargs['min_dynamic_context_parallel_size'] = 1
+    Utils.initialize_model_parallel(tp, pp, vpp, **init_kwargs)
+
+    global_batch_size = 64
+    micro_batch_size = 1
+    rng = random.Random(42)
+    nums = [rng.randint(2048, args.seq_length) for _ in range(global_batch_size)]  # 64 sequences
+
+    config = SimpleNamespace()
+    config.max_seqlen_per_dp_cp_rank = args.max_seqlen_per_dp_cp_rank
+    config.microbatch_group_size_per_vp_stage = pp
+    config.virtual_pipeline_model_parallel_size = vpp
+    config.sequence_packing_scheduler = scheduler_type
+    if is_dynamic_cp:
+        config.min_dynamic_context_parallel_size = 1
+
+    dp_rank = parallel_state.get_data_parallel_rank()
+    dp_size = parallel_state.get_data_parallel_world_size()
+
+    pp_rank = parallel_state.get_pipeline_model_parallel_rank()
+    tp_rank = parallel_state.get_tensor_model_parallel_rank()
+
+    is_pp_first = pp_rank == 0
+    is_pp_last = pp_rank == pp - 1
+    is_pp_first_or_last = is_pp_first or is_pp_last
+    is_tp_first = tp_rank == 0
+
+    num_micro_batches_old = global_batch_size // micro_batch_size // dp_size
+
+    if is_tp_first and (is_pp_first or is_pp_last):
+        # Seed torch RNG so CP siblings produce identical token values
+        torch.manual_seed(42 + dp_rank)
+        torch.cuda.manual_seed(42 + dp_rank)
+        samples = [
+            _create_single_sample(num)
+            for num in nums[dp_rank * num_micro_batches_old : (dp_rank + 1) * num_micro_batches_old]
+        ]
+        data_iterator = RerunDataIterator(iter(samples))
+    else:
+        data_iterator = None
+
+    if is_tp_first:
+        if vpp is not None and vpp > 1:
+            if is_pp_first:
+                data_iterator = [data_iterator] + [None for _ in range(vpp - 1)]
+            elif is_pp_last:
+                data_iterator = [None for _ in range(vpp - 1)] + [data_iterator]
+            else:
+                data_iterator = [None for _ in range(vpp)]
+    try:
+        # Call the function under test
+        (
+            new_data_iterator,
+            num_micro_batches,
+            num_total_tokens_this_global_batch,
+            sequence_square_sum_this_global_batch,
+        ) = wrap_data_iterator(data_iterator, config, num_micro_batches_old)
+
+        # check the result
+        assert type(num_micro_batches) is int
+        assert (
+            type(num_total_tokens_this_global_batch) is float
+            or type(num_total_tokens_this_global_batch) is np.float32
+        )
+        assert (
+            type(sequence_square_sum_this_global_batch) is float
+            or type(sequence_square_sum_this_global_batch) is np.float32
+        )
+
+        def _check_batch(batch_all, batch_keys):
+            for batch in batch_all:
+                assert set(batch_keys) <= set(
+                    batch.keys()
+                ), f"batch keys: {set(batch.keys())} missing {set(batch_keys) - set(batch.keys())}"
+                for key in batch_keys:
+                    assert batch[key] is not None
+
+        if is_tp_first:
+            # CHECK KEYS
+            batch_keys = ["cu_seqlens", "max_seqlen", "cu_seqlens_padded"]
+            if is_dynamic_cp:
+                batch_keys.append("local_cp_size")
+            if vpp is not None and vpp > 1:
+                # check metadata for all stages (save batches to avoid re-consuming iterators)
+                all_stage_batches = []
+                for temp_data_iterator in new_data_iterator:
+                    stage_batch = [next(temp_data_iterator) for _ in range(num_micro_batches)]
+                    all_stage_batches.append(stage_batch)
+                    _check_batch(stage_batch, batch_keys)
+
+                # check for first or last stage on first or last pp rank
+                if is_pp_first_or_last:
+                    batch_all = all_stage_batches[0] if is_pp_first else all_stage_batches[-1]
+                    batch_keys += ["tokens", "position_ids", "labels", "loss_mask"]
+                    _check_batch(batch_all, batch_keys)
+            else:
+                # non-VPP: single iterator
+                batch_all = [next(new_data_iterator) for _ in range(num_micro_batches)]
+                if is_pp_first_or_last:
+                    batch_keys += ["tokens", "position_ids", "labels", "loss_mask"]
+                _check_batch(batch_all, batch_keys)
+
+            # CHECK TOKEN SUM ON FIRST OR LAST PP RANK
+            # Note: data_iterator is consumed by wrap_data_iterator, new_data_iterator is consumed above.
+            # Use `samples` for before-wrap, reuse `batch_all` from the check above for after-wrap.
+            # Skip for VPP: microbatch alignment may pad/duplicate samples,
+            # changing the total token count.
+            if is_pp_first_or_last and (vpp is None or vpp <= 1):
+                dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+                cp_size = parallel_state.get_context_parallel_world_size()
+                cp_group = parallel_state.get_context_parallel_group()
+
+                # Count each sequence exactly once using int64 for bitwise comparison.
+                # THD (dp_balanced): CP siblings hold identical packed data,
+                #   so reduce across DP only (not CP) on both sides.
+                # DCP: data is redistributed uniquely across dp_cp ranks,
+                #   so per-microbatch CP all_reduce + scale, then dp_cp all_reduce.
+                # Both sides multiply by max_cp so DCP (with varying local_cp)
+                # can be normalized to the same integer scale without division.
+                max_cp = cp_size
+                dp_group = parallel_state.get_data_parallel_group()
+
+                # Before wrap: CP siblings have identical samples.
+                # Reduce across DP only to count each sequence once.
+                token_sum_before = torch.tensor(0, dtype=torch.int64, device='cuda')
+                for sample in samples:
+                    token_sum_before += sample['tokens'].long().sum()
+                torch.distributed.all_reduce(
+                    token_sum_before, op=torch.distributed.ReduceOp.SUM, group=dp_group
+                )
+                token_sum_before *= max_cp
+
+                # After wrap.
+                token_sum_after = torch.tensor(0, dtype=torch.int64, device='cuda')
+                if is_dynamic_cp:
+                    # DCP: per-microbatch CP all_reduce + scale to max_cp,
+                    # then dp_cp all_reduce to aggregate unique contributions.
+                    for batch in batch_all:
+                        mb_sum = batch['tokens'].long().sum().clone()
+                        local_cp = batch['local_cp_size']
+                        if isinstance(local_cp, torch.Tensor):
+                            local_cp = local_cp.item()
+                        mb_cp_group = parallel_state.get_dynamic_data_context_parallel_groups(
+                            group_size=local_cp
+                        )
+                        torch.distributed.all_reduce(
+                            mb_sum, op=torch.distributed.ReduceOp.SUM, group=mb_cp_group
+                        )
+                        # all_reduce result = mb_sum * local_cp.
+                        # Scale to mb_sum * max_cp.
+                        mb_sum *= max_cp // local_cp
+                        token_sum_after += mb_sum
+                    torch.distributed.all_reduce(
+                        token_sum_after, op=torch.distributed.ReduceOp.SUM, group=dp_cp_group
+                    )
+                else:
+                    # THD: CP siblings hold identical packed data.
+                    # Reduce across DP only (same as before).
+                    for batch in batch_all:
+                        token_sum_after += batch['tokens'].long().sum()
+                    torch.distributed.all_reduce(
+                        token_sum_after, op=torch.distributed.ReduceOp.SUM, group=dp_group
+                    )
+                    token_sum_after *= max_cp
+
+                assert (
+                    token_sum_before == token_sum_after
+                ), f"Token sum mismatch: before={token_sum_before.item()}, after={token_sum_after.item()}"
+
+        else:
+            if vpp is not None and vpp > 1:
+                assert type(new_data_iterator) is list and len(new_data_iterator) == vpp
+                for data_iterator in new_data_iterator:
+                    assert data_iterator is None
+            else:
+                assert new_data_iterator is None
+
+    finally:
+        Utils.destroy_model_parallel()
+        unset_global_variables()
diff --git a/tests/unit_tests/test_utils.py b/tests/unit_tests/test_utils.py
index dc554612811..95756101e74 100644
--- a/tests/unit_tests/test_utils.py
+++ b/tests/unit_tests/test_utils.py
@@ -4,9 +4,9 @@
 import time
 import urllib.request as req
 from types import SimpleNamespace
+from unittest import mock
 from unittest.mock import patch
 
-import mock
 import numpy as np
 import pytest
 import torch
diff --git a/tests/unit_tests/transformer/experimental_attention_variant/test_absorbed_mla.py b/tests/unit_tests/transformer/experimental_attention_variant/test_absorbed_mla.py
index eb235501ad7..89061ad4219 100644
--- a/tests/unit_tests/transformer/experimental_attention_variant/test_absorbed_mla.py
+++ b/tests/unit_tests/transformer/experimental_attention_variant/test_absorbed_mla.py
@@ -121,7 +121,10 @@ def _forward_thd(self, q, k, v, packed_seq_params):
 
 
 def get_mock_mla_config(
-    tensor_model_parallel_size: int, context_parallel_size: int
+    tensor_model_parallel_size: int,
+    context_parallel_size: int,
+    sequence_parallel: bool,
+    recompute_mla_up_proj: bool,
 ) -> MLATransformerConfig:
     """Create test config with all attributes used in MLA."""
     return MLATransformerConfig(
@@ -141,7 +144,7 @@ def get_mock_mla_config(
         layernorm_zero_centered_gamma=False,
         expert_model_parallel_size=1,
         tensor_model_parallel_size=tensor_model_parallel_size,
-        sequence_parallel=tensor_model_parallel_size > 1,
+        sequence_parallel=tensor_model_parallel_size > 1 and sequence_parallel,
         context_parallel_size=context_parallel_size,
         apply_rope_fusion=False,
         rope_type="yarn",
@@ -153,7 +156,8 @@ def get_mock_mla_config(
         beta_fast=32,
         beta_slow=1,
         rotary_interleaved=False,
-        recompute_granularity=None,
+        recompute_granularity="selective" if recompute_mla_up_proj else None,
+        recompute_modules=["mla_up_proj"] if recompute_mla_up_proj else [],
         fine_grained_activation_offloading=False,
         gradient_accumulation_fusion=False,
         fp8=False,
@@ -227,19 +231,35 @@ def get_mla_submodules(
     )
 
 
-@pytest.mark.parametrize("tp_cp", [[1, 1], [2, 1], [1, 2], [2, 2]])
+# TODO: Consider using get_gpt_layer_with_transformer_engine_spec from
+#       megatron.core.models.gpt.gpt_layer_specs to simplify submodule setup and cover real specs.
+# TODO: Add test case to cover TP > 1 but SP = False.
+
+
+@pytest.mark.parametrize("tp_cp_sp", [[1, 1, False], [2, 1, True], [1, 2, False], [2, 2, True]])
 @pytest.mark.parametrize("qkv_format", ['sbhd', 'thd'])
 @pytest.mark.parametrize("down_proj_use_column_parallel", [False, True])
-def test_functionality(tp_cp: List[int], qkv_format: str, down_proj_use_column_parallel: bool):
+@pytest.mark.parametrize("recompute_mla_up_proj", [False, True])
+def test_functionality(
+    tp_cp_sp: List,
+    qkv_format: str,
+    down_proj_use_column_parallel: bool,
+    recompute_mla_up_proj: bool,
+):
     """Test that AbsorbedMLASelfAttention is equivalent to standard MLA."""
-    tp_size, cp_size = tp_cp
+    tp_size, cp_size, sp = tp_cp_sp
     Utils.initialize_model_parallel(
         tensor_model_parallel_size=tp_size, context_parallel_size=cp_size
     )
     model_parallel_cuda_manual_seed(123)
 
     # Create model
-    config = get_mock_mla_config(tensor_model_parallel_size=tp_size, context_parallel_size=cp_size)
+    config = get_mock_mla_config(
+        tensor_model_parallel_size=tp_size,
+        context_parallel_size=cp_size,
+        sequence_parallel=sp,
+        recompute_mla_up_proj=recompute_mla_up_proj,
+    )
     absorbed_submodules = get_absorbed_mla_submodules(
         down_proj_use_column_parallel=down_proj_use_column_parallel,
         qk_layernorm=True,
@@ -295,13 +315,15 @@ def test_functionality(tp_cp: List[int], qkv_format: str, down_proj_use_column_p
             qkv_format='thd',
         )
         hidden_states = torch.randn(
-            (total_tokens // tp_size // cp_size, 1, config.hidden_size),
+            (total_tokens // cp_size // (tp_size if sp else 1), 1, config.hidden_size),
             dtype=torch.bfloat16,
             device='cuda',
         )
         grads = torch.randn_like(hidden_states)
     else:
-        seqlen = 1024 // tp_size // cp_size
+        # When SP is enabled, sequence is sharded across TP ranks
+        # When SP is disabled, each TP rank has the full sequence
+        seqlen = 1024 // cp_size // (tp_size if sp else 1)
         hidden_states = torch.randn((seqlen, 3, 7168), dtype=torch.bfloat16, device='cuda')
         grads = torch.randn_like(hidden_states)
         packed_seq_params = None
diff --git a/tests/unit_tests/transformer/experimental_attention_variant/test_attention_variant_csa.py b/tests/unit_tests/transformer/experimental_attention_variant/test_attention_variant_csa.py
new file mode 100644
index 00000000000..83c153d698e
--- /dev/null
+++ b/tests/unit_tests/transformer/experimental_attention_variant/test_attention_variant_csa.py
@@ -0,0 +1,872 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.experimental_attention_variant.csa import (
+    CompressedSparseAttention,
+    CompressedSparseAttentionSubmodules,
+    Compressor,
+    CompressorSubmodules,
+    CSAIndexer,
+    CSAIndexerSubmodules,
+    get_compress_topk_idxs,
+    get_window_topk_idxs,
+    unfused_compressed_sparse_attn,
+)
+from megatron.core.transformer.transformer_config import MLATransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    from fast_hadamard_transform import hadamard_transform as _hadamard_transform
+
+    HAVE_HADAMARD = True
+except ImportError:
+    HAVE_HADAMARD = False
+    _hadamard_transform = None
+
+
+def mock_hadamard_transform(x: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+    """Mock implementation of hadamard_transform for testing without the library installed."""
+    return x * scale
+
+
+@pytest.fixture(autouse=True)
+def patch_hadamard_if_needed():
+    """Automatically patch hadamard_transform in both dsa and csa modules if not installed."""
+    if not HAVE_HADAMARD:
+        with (
+            patch(
+                'megatron.core.transformer.experimental_attention_variant.dsa.hadamard_transform',
+                mock_hadamard_transform,
+            ),
+            patch(
+                'megatron.core.transformer.experimental_attention_variant.csa.rotate_activation',
+                lambda x: x * (x.size(-1) ** -0.5),
+            ),
+        ):
+            yield
+    else:
+        yield
+
+
+# ===========================================================================
+# Helper function tests
+# ===========================================================================
+
+
+class TestGetWindowTopkIdxs:
+    """Test get_window_topk_idxs helper."""
+
+    def test_basic_shape(self):
+        batch_size, seqlen, window_size = 2, 16, 4
+        idxs = get_window_topk_idxs(window_size, batch_size, seqlen, torch.device("cpu"))
+        assert idxs.shape == (batch_size, seqlen, window_size)
+
+    def test_causal_no_future(self):
+        """Indices should never exceed the query position."""
+        seqlen, window_size = 32, 8
+        idxs = get_window_topk_idxs(window_size, 1, seqlen, torch.device("cpu"))
+        for i in range(seqlen):
+            valid = idxs[0, i][idxs[0, i] >= 0]
+            assert torch.all(valid <= i), f"Position {i} has future indices"
+
+    def test_invalid_marked_minus_one(self):
+        """Early positions that cannot fill the window should use -1."""
+        seqlen, window_size = 8, 4
+        idxs = get_window_topk_idxs(window_size, 1, seqlen, torch.device("cpu"))
+        assert idxs[0, 0, 0] == -1 or idxs[0, 0, 0] == 0
+        for pos in range(window_size, seqlen):
+            assert torch.all(idxs[0, pos] >= 0), f"Position {pos} has invalid -1"
+
+    def test_window_larger_than_seqlen(self):
+        """Window larger than sequence length should still work."""
+        seqlen, window_size = 4, 16
+        idxs = get_window_topk_idxs(window_size, 1, seqlen, torch.device("cpu"))
+        assert idxs.shape == (1, seqlen, window_size)
+
+
+class TestGetCompressTopkIdxs:
+    """Test get_compress_topk_idxs helper."""
+
+    def test_basic_shape(self):
+        ratio, batch_size, seqlen, offset = 4, 2, 32, 32
+        idxs = get_compress_topk_idxs(ratio, batch_size, seqlen, offset, torch.device("cpu"))
+        n_compressed = seqlen // ratio
+        assert idxs.shape == (batch_size, seqlen, n_compressed)
+
+    def test_offset_applied(self):
+        """Valid indices should be >= offset."""
+        ratio, seqlen, offset = 4, 32, 100
+        idxs = get_compress_topk_idxs(ratio, 1, seqlen, offset, torch.device("cpu"))
+        valid = idxs[idxs >= 0]
+        if valid.numel() > 0:
+            assert torch.all(valid >= offset), "Valid indices should be offset"
+
+    def test_causal_no_future(self):
+        """Compressed indices should respect causality."""
+        ratio, seqlen, offset = 4, 32, 32
+        idxs = get_compress_topk_idxs(ratio, 1, seqlen, offset, torch.device("cpu"))
+        for i in range(seqlen):
+            n_valid = (i + 1) // ratio
+            valid = idxs[0, i][idxs[0, i] >= 0]
+            assert valid.numel() <= n_valid, f"Position {i} has too many valid compressed indices"
+
+    def test_ratio_128(self):
+        """Test with large compression ratio."""
+        ratio, seqlen, offset = 128, 256, 256
+        idxs = get_compress_topk_idxs(ratio, 1, seqlen, offset, torch.device("cpu"))
+        assert idxs.shape == (1, seqlen, seqlen // ratio)
+
+
+# ===========================================================================
+# unfused_compressed_sparse_attn tests
+# ===========================================================================
+
+
+class TestUnfusedCompressedSparseAttn:
+    """Test the unfused compressed sparse attention kernel."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        yield
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_output_shape(self):
+        """Test output shape of unfused compressed sparse attention."""
+        sq, b, np_, hn = 16, 2, 4, 64
+        n_kv = sq + sq // 4
+        topk = 8
+
+        query = torch.randn(sq, b, np_, hn, dtype=torch.bfloat16).cuda()
+        kv_full = torch.randn(n_kv, b, hn, dtype=torch.bfloat16).cuda()
+        attn_sink = torch.zeros(np_, dtype=torch.float32).cuda()
+        topk_indices = torch.randint(0, n_kv, (b, sq, topk), dtype=torch.int32).cuda()
+        softmax_scale = hn**-0.5
+
+        output = unfused_compressed_sparse_attn(
+            query, kv_full, attn_sink, topk_indices, softmax_scale
+        )
+
+        assert output.shape == (sq, b, np_ * hn)
+        assert output.dtype == query.dtype
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_invalid_indices_masked(self):
+        """Test that -1 indices are properly masked."""
+        sq, b, np_, hn = 8, 1, 2, 32
+        n_kv = sq
+        topk = 4
+
+        query = torch.randn(sq, b, np_, hn, dtype=torch.bfloat16).cuda()
+        kv_full = torch.randn(n_kv, b, hn, dtype=torch.bfloat16).cuda()
+        attn_sink = torch.zeros(np_, dtype=torch.float32).cuda()
+
+        topk_indices = torch.full((b, sq, topk), -1, dtype=torch.int32).cuda()
+        topk_indices[:, :, 0] = 0
+        softmax_scale = hn**-0.5
+
+        output = unfused_compressed_sparse_attn(
+            query, kv_full, attn_sink, topk_indices, softmax_scale
+        )
+        assert not torch.isnan(output).any(), "Output should not contain NaN"
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_gradient_flow(self):
+        """Test that gradients flow through sparse attention."""
+        sq, b, np_, hn = 8, 1, 2, 32
+        n_kv = sq
+        topk = 4
+
+        query = torch.randn(sq, b, np_, hn, dtype=torch.float32).cuda().requires_grad_(True)
+        kv_full = torch.randn(n_kv, b, hn, dtype=torch.float32).cuda().requires_grad_(True)
+        attn_sink = torch.nn.Parameter(torch.zeros(np_, dtype=torch.float32).cuda())
+
+        topk_indices = torch.randint(0, n_kv, (b, sq, topk), dtype=torch.int32).cuda()
+        softmax_scale = hn**-0.5
+
+        output = unfused_compressed_sparse_attn(
+            query, kv_full, attn_sink, topk_indices, softmax_scale
+        )
+        loss = output.sum()
+        loss.backward()
+
+        assert query.grad is not None
+        assert kv_full.grad is not None
+        assert attn_sink.grad is not None
+
+
+# ===========================================================================
+# Compressor tests
+# ===========================================================================
+
+
+def _make_mla_config(
+    num_layers=4,
+    hidden_size=256,
+    num_attention_heads=16,
+    v_head_dim=64,
+    qk_pos_emb_head_dim=32,
+    csa_compress_ratios=None,
+    csa_window_size=8,
+    csa_dense_mode=False,
+    tensor_model_parallel_size=1,
+    sequence_parallel=False,
+    dsa_indexer_n_heads=8,
+    dsa_indexer_head_dim=64,
+    dsa_indexer_topk=8,
+    dsa_indexer_loss_coeff=0.0,
+    dsa_indexer_use_sparse_loss=False,
+):
+    """Helper to create MLATransformerConfig for CSA tests."""
+    if csa_compress_ratios is None:
+        csa_compress_ratios = [0] * num_layers
+    return MLATransformerConfig(
+        num_layers=num_layers,
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        use_cpu_initialization=True,
+        bf16=True,
+        params_dtype=torch.bfloat16,
+        tensor_model_parallel_size=tensor_model_parallel_size,
+        sequence_parallel=sequence_parallel,
+        q_lora_rank=64,
+        kv_lora_rank=64,
+        qk_head_dim=v_head_dim - qk_pos_emb_head_dim,
+        qk_pos_emb_head_dim=qk_pos_emb_head_dim,
+        v_head_dim=v_head_dim,
+        rope_type='rope',
+        rotary_base=10000,
+        rotary_percent=1.0,
+        multi_latent_attention=True,
+        experimental_attention_variant='dsv4_hybrid',
+        csa_compress_ratios=csa_compress_ratios,
+        csa_window_size=csa_window_size,
+        csa_dense_mode=csa_dense_mode,
+        dsa_indexer_n_heads=dsa_indexer_n_heads,
+        dsa_indexer_head_dim=dsa_indexer_head_dim,
+        dsa_indexer_topk=dsa_indexer_topk,
+        dsa_indexer_loss_coeff=dsa_indexer_loss_coeff,
+        dsa_indexer_use_sparse_loss=dsa_indexer_use_sparse_loss,
+    )
+
+
+def _make_compressor_submodules():
+    """Create Compressor submodules spec."""
+    from megatron.core.extensions.transformer_engine import TELinear, TENorm
+    from megatron.core.transformer.spec_utils import ModuleSpec
+
+    return CompressorSubmodules(
+        linear_wkv=ModuleSpec(module=TELinear),
+        linear_wgate=ModuleSpec(module=TELinear),
+        norm=ModuleSpec(module=TENorm),
+    )
+
+
+def _make_csa_indexer_submodules():
+    """Create CSAIndexer submodules spec."""
+    from megatron.core.extensions.transformer_engine import TELinear, TENorm
+    from megatron.core.transformer.spec_utils import ModuleSpec
+
+    return CSAIndexerSubmodules(
+        linear_wq_b=ModuleSpec(module=TELinear),
+        linear_weights_proj=ModuleSpec(module=TELinear),
+        compressor=ModuleSpec(module=Compressor, submodules=_make_compressor_submodules()),
+    )
+
+
+def _make_csa_submodules():
+    """Create CompressedSparseAttention submodules spec."""
+    from megatron.core.transformer.spec_utils import ModuleSpec
+
+    return CompressedSparseAttentionSubmodules(
+        compressor=ModuleSpec(module=Compressor, submodules=_make_compressor_submodules()),
+        indexer=ModuleSpec(module=CSAIndexer, submodules=_make_csa_indexer_submodules()),
+    )
+
+
+# ===========================================================================
+# Compressor tests
+# ===========================================================================
+
+
+@pytest.mark.parametrize("compress_ratio", [4, 128])
+class TestCompressor:
+    """Test Compressor module."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self, request):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        cls = request.cls
+        cls.config = _make_mla_config(csa_compress_ratios=[4, 128, 4, 128])
+        cls.pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+
+        from megatron.core.models.common.embeddings import RotaryEmbedding
+
+        cls.rotary_pos_emb = RotaryEmbedding(
+            cls.config.qk_pos_emb_head_dim,
+            rotary_percent=cls.config.rotary_percent,
+            rotary_base=cls.config.rotary_base,
+            cp_group=cls.pg_collection.cp,
+        )
+
+        yield
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_compressor_output_shape(self, compress_ratio):
+        """Test that compressor produces correct output shape."""
+        seq_len = 256
+        batch_size = 2
+        head_dim = self.config.v_head_dim
+
+        compressor = Compressor(
+            config=self.config,
+            submodules=_make_compressor_submodules(),
+            compress_ratio=compress_ratio,
+            head_dim=head_dim,
+            rotate=False,
+            rotary_pos_emb=self.rotary_pos_emb,
+            pg_collection=self.pg_collection,
+        ).cuda()
+
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        output = compressor(x)
+
+        expected_len = seq_len // compress_ratio
+        assert output is not None
+        assert output.shape == (expected_len, batch_size, head_dim)
+        assert output.dtype == torch.bfloat16
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_compressor_too_short_input(self, compress_ratio):
+        """Test that compressor returns None when input is shorter than compress_ratio."""
+        short_len = compress_ratio - 1
+        batch_size = 2
+        head_dim = self.config.v_head_dim
+
+        compressor = Compressor(
+            config=self.config,
+            submodules=_make_compressor_submodules(),
+            compress_ratio=compress_ratio,
+            head_dim=head_dim,
+            rotate=False,
+            rotary_pos_emb=self.rotary_pos_emb,
+            pg_collection=self.pg_collection,
+        ).cuda()
+
+        x = torch.randn(short_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        output = compressor(x)
+        assert output is None
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_compressor_gradient_flow(self, compress_ratio):
+        """Test that gradients flow through the compressor."""
+        seq_len = 256
+        batch_size = 2
+        head_dim = self.config.v_head_dim
+
+        compressor = Compressor(
+            config=self.config,
+            submodules=_make_compressor_submodules(),
+            compress_ratio=compress_ratio,
+            head_dim=head_dim,
+            rotate=False,
+            rotary_pos_emb=self.rotary_pos_emb,
+            pg_collection=self.pg_collection,
+        ).cuda()
+
+        x = (
+            torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+        output = compressor(x)
+        loss = output.sum()
+        loss.backward()
+
+        assert x.grad is not None
+        for name, param in compressor.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Parameter {name} has no gradient"
+
+
+# ===========================================================================
+# CSAIndexer tests
+# ===========================================================================
+
+
+@pytest.mark.parametrize("seqlen", [32, 128])
+class TestCSAIndexer:
+    """Test CSAIndexer module basic functionality."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self, request):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        cls = request.cls
+        cls.compress_ratio = 4
+        cls.config = _make_mla_config(csa_compress_ratios=[4, 4, 4, 4], dsa_indexer_topk=8)
+        cls.pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+
+        from megatron.core.models.common.embeddings import RotaryEmbedding
+
+        cls.rotary_pos_emb = RotaryEmbedding(
+            cls.config.qk_pos_emb_head_dim,
+            rotary_percent=cls.config.rotary_percent,
+            rotary_base=cls.config.rotary_base,
+            cp_group=cls.pg_collection.cp,
+        )
+
+        cls.indexer = CSAIndexer(
+            config=cls.config,
+            submodules=_make_csa_indexer_submodules(),
+            compress_ratio=cls.compress_ratio,
+            rotary_pos_emb=cls.rotary_pos_emb,
+            pg_collection=cls.pg_collection,
+        )
+
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_csa_indexer_constructor(self, seqlen):
+        """Test CSAIndexer initialization."""
+        assert isinstance(self.indexer, CSAIndexer)
+        assert self.indexer.compress_ratio == self.compress_ratio
+        assert self.indexer.index_n_heads == self.config.dsa_indexer_n_heads
+        assert self.indexer.index_head_dim == self.config.dsa_indexer_head_dim
+        assert self.indexer.index_topk == self.config.dsa_indexer_topk
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_csa_indexer_forward(self, seqlen):
+        """Test CSAIndexer forward pass."""
+        batch_size = 2
+        self.indexer.cuda()
+
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        index_scores, topk_indices = self.indexer(x, qr)
+        n_compressed = seqlen // self.compress_ratio
+        effective_topk = min(self.config.dsa_indexer_topk, n_compressed)
+
+        assert index_scores.shape == (batch_size, seqlen, n_compressed)
+        assert topk_indices.shape == (batch_size, seqlen, effective_topk)
+        assert index_scores.dtype == torch.float32
+        assert topk_indices.dtype == torch.long
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_csa_indexer_forward_before_topk(self, seqlen):
+        """Test CSAIndexer forward_before_topk."""
+        batch_size = 2
+        self.indexer.cuda()
+
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        q, k, weights = self.indexer.forward_before_topk(x, qr)
+
+        assert q.shape == (
+            seqlen,
+            batch_size,
+            self.config.dsa_indexer_n_heads,
+            self.config.dsa_indexer_head_dim,
+        )
+        n_compressed = seqlen // self.compress_ratio
+        assert k.shape == (n_compressed, batch_size, self.config.dsa_indexer_head_dim)
+        assert weights.shape == (seqlen, batch_size, self.config.dsa_indexer_n_heads)
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_csa_indexer_with_mask(self, seqlen):
+        """Test CSAIndexer with causal mask."""
+        batch_size = 2
+        self.indexer.cuda()
+
+        x = torch.randn(seqlen, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seqlen, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        n_compressed = seqlen // self.compress_ratio
+        causal_mask = torch.arange(n_compressed, device=x.device).unsqueeze(0).expand(seqlen, -1)
+        positions = torch.arange(1, seqlen + 1, device=x.device).unsqueeze(1)
+        causal_mask = (
+            torch.where(causal_mask >= positions // self.compress_ratio, float("-inf"), 0.0)
+            .unsqueeze(0)
+            .expand(batch_size, -1, -1)
+        )
+
+        index_scores, topk_indices = self.indexer(x, qr, mask=causal_mask)
+
+        effective_topk = min(self.config.dsa_indexer_topk, n_compressed)
+        assert index_scores.shape == (batch_size, seqlen, n_compressed)
+        assert topk_indices.shape == (batch_size, seqlen, effective_topk)
+
+
+# ===========================================================================
+# CompressedSparseAttention tests
+# ===========================================================================
+
+
+class TestCompressedSparseAttentionRatio1:
+    """Test CompressedSparseAttention with compress_ratio=1 (window-only)."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self, request):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        cls = request.cls
+        cls.config = _make_mla_config(csa_compress_ratios=[0, 0, 0, 0], csa_window_size=8)
+        cls.pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+
+        from megatron.core.models.common.embeddings import RotaryEmbedding
+
+        rotary_pos_emb = RotaryEmbedding(
+            cls.config.qk_pos_emb_head_dim,
+            rotary_percent=cls.config.rotary_percent,
+            rotary_base=cls.config.rotary_base,
+            cp_group=cls.pg_collection.cp,
+        )
+
+        cls.csa = CompressedSparseAttention(
+            config=cls.config,
+            submodules=_make_csa_submodules(),
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=cls.pg_collection,
+            rotary_pos_emb=rotary_pos_emb,
+            compress_ratio=0,
+        )
+
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_ratio1_no_compressor(self):
+        """With ratio=1, compressor and indexer should not be built."""
+        assert self.csa.compressor is None
+        assert self.csa.indexer is None
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_ratio1_forward(self):
+        """Test forward pass with window-only attention."""
+        seq_len = 32
+        batch_size = 2
+        np_ = self.config.num_attention_heads
+        hn = self.config.v_head_dim
+
+        self.csa.cuda()
+
+        query = torch.randn(seq_len, batch_size, np_, hn, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seq_len, batch_size, 1, hn, dtype=torch.bfloat16).cuda()
+        value = key.clone()
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        output = self.csa(query=query, key=key, value=value, attention_mask=None, x=x, qr=qr)
+
+        assert output.shape == (seq_len, batch_size, np_ * hn)
+        assert output.dtype == torch.bfloat16
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_ratio1_backward(self):
+        """Test backward pass with window-only attention."""
+        seq_len = 32
+        batch_size = 2
+        np_ = self.config.num_attention_heads
+        hn = self.config.v_head_dim
+
+        self.csa.train()
+        self.csa.cuda()
+
+        query = (
+            torch.randn(seq_len, batch_size, np_, hn, dtype=torch.float32)
+            .cuda()
+            .requires_grad_(True)
+        )
+        key = (
+            torch.randn(seq_len, batch_size, 1, hn, dtype=torch.float32).cuda().requires_grad_(True)
+        )
+        value = key.clone().detach().requires_grad_(True)
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        output = self.csa(query=query, key=key, value=value, attention_mask=None, x=x, qr=qr)
+        loss = output.sum()
+        loss.backward()
+
+        assert query.grad is not None
+        assert key.grad is not None
+
+
+@pytest.mark.parametrize("compress_ratio", [4, 128])
+class TestCompressedSparseAttentionCompressed:
+    """Test CompressedSparseAttention with compress_ratio > 1."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self, request):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        cls = request.cls
+        cls.config = _make_mla_config(
+            csa_compress_ratios=[4, 128, 4, 128],
+            csa_window_size=8,
+            dsa_indexer_topk=8,
+            dsa_indexer_loss_coeff=1.0,
+        )
+        cls.pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+
+        from megatron.core.models.common.embeddings import RotaryEmbedding
+
+        cls.rotary_pos_emb = RotaryEmbedding(
+            cls.config.qk_pos_emb_head_dim,
+            rotary_percent=cls.config.rotary_percent,
+            rotary_base=cls.config.rotary_base,
+            cp_group=cls.pg_collection.cp,
+        )
+
+        yield
+        Utils.destroy_model_parallel()
+
+    def _get_layer_number(self, compress_ratio):
+        """Return a layer_number (1-indexed) whose compress_ratio matches."""
+        for i, r in enumerate(self.config.csa_compress_ratios):
+            if r == compress_ratio:
+                return i + 1
+        raise ValueError(f"No layer with compress_ratio={compress_ratio}")
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_constructor(self, compress_ratio):
+        """Test that compressor/indexer are conditionally built."""
+        layer_number = self._get_layer_number(compress_ratio)
+        csa = CompressedSparseAttention(
+            config=self.config,
+            submodules=_make_csa_submodules(),
+            layer_number=layer_number,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=self.pg_collection,
+            rotary_pos_emb=self.rotary_pos_emb,
+            compress_ratio=compress_ratio,
+        ).cuda()
+
+        assert csa.compressor is not None
+        if compress_ratio == 4:
+            assert csa.indexer is not None
+        elif compress_ratio == 128:
+            assert csa.indexer is None
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_forward(self, compress_ratio):
+        """Test forward pass with compressed attention."""
+        seq_len = 256
+        batch_size = 2
+        np_ = self.config.num_attention_heads
+        hn = self.config.v_head_dim
+
+        layer_number = self._get_layer_number(compress_ratio)
+        csa = CompressedSparseAttention(
+            config=self.config,
+            submodules=_make_csa_submodules(),
+            layer_number=layer_number,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=self.pg_collection,
+            rotary_pos_emb=self.rotary_pos_emb,
+            compress_ratio=compress_ratio,
+        ).cuda()
+
+        query = torch.randn(seq_len, batch_size, np_, hn, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seq_len, batch_size, 1, hn, dtype=torch.bfloat16).cuda()
+        value = key.clone()
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        output = csa(query=query, key=key, value=value, attention_mask=None, x=x, qr=qr)
+
+        assert output.shape == (seq_len, batch_size, np_ * hn)
+        assert not torch.isnan(output).any()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_backward(self, compress_ratio):
+        """Test backward pass with compressed attention."""
+        seq_len = 256
+        batch_size = 2
+        np_ = self.config.num_attention_heads
+        hn = self.config.v_head_dim
+
+        layer_number = self._get_layer_number(compress_ratio)
+        csa = CompressedSparseAttention(
+            config=self.config,
+            submodules=_make_csa_submodules(),
+            layer_number=layer_number,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=self.pg_collection,
+            rotary_pos_emb=self.rotary_pos_emb,
+            compress_ratio=compress_ratio,
+        ).cuda()
+        csa.train()
+
+        query = (
+            torch.randn(seq_len, batch_size, np_, hn, dtype=torch.float32)
+            .cuda()
+            .requires_grad_(True)
+        )
+        key = (
+            torch.randn(seq_len, batch_size, 1, hn, dtype=torch.float32).cuda().requires_grad_(True)
+        )
+        value = key.clone().detach().requires_grad_(True)
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        output = csa(query=query, key=key, value=value, attention_mask=None, x=x, qr=qr)
+        loss = output.sum()
+        loss.backward()
+
+        assert query.grad is not None
+        assert key.grad is not None
+
+        for name, param in csa.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"Parameter {name} has no gradient"
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_eval_mode(self, compress_ratio):
+        """Test forward pass in eval mode."""
+        seq_len = 256
+        batch_size = 2
+        np_ = self.config.num_attention_heads
+        hn = self.config.v_head_dim
+
+        layer_number = self._get_layer_number(compress_ratio)
+        csa = CompressedSparseAttention(
+            config=self.config,
+            submodules=_make_csa_submodules(),
+            layer_number=layer_number,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=self.pg_collection,
+            rotary_pos_emb=self.rotary_pos_emb,
+            compress_ratio=compress_ratio,
+        ).cuda()
+        csa.eval()
+
+        query = torch.randn(seq_len, batch_size, np_, hn, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seq_len, batch_size, 1, hn, dtype=torch.bfloat16).cuda()
+        value = key.clone()
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        with torch.no_grad():
+            output = csa(query=query, key=key, value=value, attention_mask=None, x=x, qr=qr)
+
+        assert output.shape == (seq_len, batch_size, np_ * hn)
+        assert not torch.isnan(output).any()
+
+
+# ===========================================================================
+# csa_dense_mode tests
+# ===========================================================================
+
+
+class TestCompressedSparseAttentionDenseMode:
+    """Test that csa_dense_mode=True disables the indexer for ratio=4 layers."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self, request):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(123)
+        model_parallel_cuda_manual_seed(123)
+
+        cls = request.cls
+        cls.config = _make_mla_config(
+            csa_compress_ratios=[4, 128, 4, 128], csa_window_size=8, csa_dense_mode=True
+        )
+        cls.pg_collection = ProcessGroupCollection.use_mpu_process_groups(required_pgs=['tp', 'cp'])
+
+        from megatron.core.models.common.embeddings import RotaryEmbedding
+
+        cls.rotary_pos_emb = RotaryEmbedding(
+            cls.config.qk_pos_emb_head_dim,
+            rotary_percent=cls.config.rotary_percent,
+            rotary_base=cls.config.rotary_base,
+            cp_group=cls.pg_collection.cp,
+        )
+
+        yield
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dense_mode_disables_indexer_for_ratio4(self):
+        """With csa_dense_mode=True, ratio=4 layers should NOT build an indexer."""
+        csa = CompressedSparseAttention(
+            config=self.config,
+            submodules=_make_csa_submodules(),
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=self.pg_collection,
+            rotary_pos_emb=self.rotary_pos_emb,
+            compress_ratio=4,
+        ).cuda()
+
+        assert csa.compress_ratio == 4
+        assert csa.compressor is not None, "Compressor should still be built"
+        assert csa.indexer is None, "Indexer should be disabled in dense mode"
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_dense_mode_forward_ratio4(self):
+        """Forward pass should work for ratio=4 in dense mode (uses all compressed positions)."""
+        seq_len = 256
+        batch_size = 2
+        np_ = self.config.num_attention_heads
+        hn = self.config.v_head_dim
+
+        csa = CompressedSparseAttention(
+            config=self.config,
+            submodules=_make_csa_submodules(),
+            layer_number=1,
+            attn_mask_type=AttnMaskType.causal,
+            attention_type='self',
+            pg_collection=self.pg_collection,
+            rotary_pos_emb=self.rotary_pos_emb,
+            compress_ratio=4,
+        ).cuda()
+
+        query = torch.randn(seq_len, batch_size, np_, hn, dtype=torch.bfloat16).cuda()
+        key = torch.randn(seq_len, batch_size, 1, hn, dtype=torch.bfloat16).cuda()
+        value = key.clone()
+        x = torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16).cuda()
+        qr = torch.randn(seq_len, batch_size, self.config.q_lora_rank, dtype=torch.bfloat16).cuda()
+
+        output = csa(query=query, key=key, value=value, attention_mask=None, x=x, qr=qr)
+
+        assert output.shape == (seq_len, batch_size, np_ * hn)
+        assert not torch.isnan(output).any()
diff --git a/tests/unit_tests/transformer/experimental_attention_variant/test_dsv4_hybrid_attention.py b/tests/unit_tests/transformer/experimental_attention_variant/test_dsv4_hybrid_attention.py
new file mode 100644
index 00000000000..5f9a3a74440
--- /dev/null
+++ b/tests/unit_tests/transformer/experimental_attention_variant/test_dsv4_hybrid_attention.py
@@ -0,0 +1,416 @@
+# Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
+
+from unittest.mock import patch
+
+import pytest
+import torch
+
+import megatron.core.parallel_state as parallel_state
+from megatron.core.extensions.transformer_engine import HAVE_TE
+from megatron.core.process_groups_config import ProcessGroupCollection
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.enums import AttnMaskType
+from megatron.core.transformer.transformer_config import MLATransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+try:
+    from fast_hadamard_transform import hadamard_transform as _hadamard_transform
+
+    HAVE_HADAMARD = True
+except ImportError:
+    HAVE_HADAMARD = False
+    _hadamard_transform = None
+
+_SEED = 42
+
+
+def _mock_hadamard_transform(x: torch.Tensor, scale: float = 1.0) -> torch.Tensor:
+    return x * scale
+
+
+@pytest.fixture(autouse=True)
+def patch_hadamard_if_needed():
+    """Patch hadamard_transform in dsa/csa modules if the library is not installed."""
+    if not HAVE_HADAMARD:
+        with (
+            patch(
+                'megatron.core.transformer.experimental_attention_variant.dsa.hadamard_transform',
+                _mock_hadamard_transform,
+            ),
+            patch(
+                'megatron.core.transformer.experimental_attention_variant.csa.rotate_activation',
+                lambda x: x * (x.size(-1) ** -0.5),
+            ),
+        ):
+            yield
+    else:
+        yield
+
+
+# ---------------------------------------------------------------------------
+# Config / spec helpers
+# ---------------------------------------------------------------------------
+
+
+def _make_config(
+    num_layers=4,
+    hidden_size=256,
+    num_attention_heads=16,
+    v_head_dim=64,
+    qk_pos_emb_head_dim=32,
+    q_lora_rank=64,
+    o_groups=8,
+    o_lora_rank=64,
+    csa_compress_ratios=None,
+    csa_window_size=8,
+    tensor_model_parallel_size=1,
+    sequence_parallel=False,
+    dsa_indexer_n_heads=8,
+    dsa_indexer_head_dim=64,
+    dsa_indexer_topk=8,
+    dsa_indexer_loss_coeff=0.0,
+):
+    """Create an MLATransformerConfig for DSv4 hybrid attention tests."""
+    if csa_compress_ratios is None:
+        csa_compress_ratios = [0, 4, 128, 4]
+    return MLATransformerConfig(
+        num_layers=num_layers,
+        hidden_size=hidden_size,
+        num_attention_heads=num_attention_heads,
+        use_cpu_initialization=True,
+        bf16=True,
+        params_dtype=torch.bfloat16,
+        add_bias_linear=False,
+        tensor_model_parallel_size=tensor_model_parallel_size,
+        sequence_parallel=sequence_parallel,
+        q_lora_rank=q_lora_rank,
+        kv_lora_rank=v_head_dim - qk_pos_emb_head_dim,
+        qk_head_dim=v_head_dim - qk_pos_emb_head_dim,
+        qk_pos_emb_head_dim=qk_pos_emb_head_dim,
+        v_head_dim=v_head_dim,
+        o_groups=o_groups,
+        o_lora_rank=o_lora_rank,
+        rope_type='rope',
+        rotary_base=10000,
+        rotary_percent=1.0,
+        multi_latent_attention=True,
+        experimental_attention_variant='dsv4_hybrid',
+        csa_compress_ratios=csa_compress_ratios,
+        csa_window_size=csa_window_size,
+        dsa_indexer_n_heads=dsa_indexer_n_heads,
+        dsa_indexer_head_dim=dsa_indexer_head_dim,
+        dsa_indexer_topk=dsa_indexer_topk,
+        dsa_indexer_loss_coeff=dsa_indexer_loss_coeff,
+    )
+
+
+def _make_attention_spec(config):
+    """Build the full DSv4HybridSelfAttention ModuleSpec using the canonical spec builder."""
+    from megatron.core.extensions.transformer_engine_spec_provider import TESpecProvider
+    from megatron.core.models.gpt.experimental_attention_variant_module_specs import (
+        get_dsv4_hybrid_module_spec_for_backend,
+    )
+
+    return get_dsv4_hybrid_module_spec_for_backend(config=config, backend=TESpecProvider())
+
+
+def _build_attention(config, layer_number, pg_collection):
+    """Instantiate a DSv4HybridSelfAttention from config."""
+    from megatron.core.transformer.spec_utils import build_module
+
+    spec = _make_attention_spec(config)
+    return build_module(spec, config=config, layer_number=layer_number, pg_collection=pg_collection)
+
+
+# ===========================================================================
+# Constructor tests
+# ===========================================================================
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available")
+class TestDSv4HybridAttentionConstructor:
+    """Test construction of DSv4HybridSelfAttention across TP sizes."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_basic_construction(self):
+        """Verify the layer builds and has the expected sub-modules."""
+        from megatron.core.transformer.experimental_attention_variant.deepseek_v4_hybrid_attention import (
+            DSv4HybridSelfAttention,
+        )
+
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        config = _make_config()
+        pg = ProcessGroupCollection.use_mpu_process_groups()
+        attn = _build_attention(config, layer_number=1, pg_collection=pg)
+
+        assert isinstance(attn, DSv4HybridSelfAttention)
+        assert hasattr(attn, 'linear_q_down_proj')
+        assert hasattr(attn, 'linear_q_up_proj')
+        assert hasattr(attn, 'linear_kv_proj')
+        assert hasattr(attn, 'linear_proj')
+        assert hasattr(attn, 'linear_o_group_proj')
+        assert hasattr(attn, 'core_attention')
+        assert hasattr(attn, 'q_layernorm')
+        assert hasattr(attn, 'kv_layernorm')
+
+    def test_q_head_dim_equals_v_head_dim(self):
+        """q_head_dim must equal v_head_dim for DSv4 hybrid."""
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        config = _make_config()
+        pg = ProcessGroupCollection.use_mpu_process_groups()
+        attn = _build_attention(config, layer_number=1, pg_collection=pg)
+
+        assert attn.q_head_dim == config.v_head_dim
+
+    @pytest.mark.parametrize("layer_number", [1, 2, 3, 4])
+    def test_rope_base_varies_with_compress_ratio(self, layer_number):
+        """Layers with compress_ratio > 1 should use csa_compress_rotary_base."""
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        ratios = [0, 4, 128, 4]
+        config = _make_config(csa_compress_ratios=ratios)
+        pg = ProcessGroupCollection.use_mpu_process_groups()
+        attn = _build_attention(config, layer_number=layer_number, pg_collection=pg)
+
+        ratio = ratios[layer_number - 1]
+        if ratio > 1:
+            expected_base = config.csa_compress_rotary_base
+        else:
+            expected_base = config.rotary_base
+
+        # inv_freq is derived from rotary_base; verify the correct base was used
+        dim = config.qk_pos_emb_head_dim
+        recomputed_inv_freq = 1.0 / (
+            expected_base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)
+        )
+        assert torch.allclose(
+            attn.rotary_pos_emb.inv_freq.cpu(), recomputed_inv_freq, rtol=1e-5, atol=1e-5
+        )
+
+
+# ===========================================================================
+# Forward / backward tests
+# ===========================================================================
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available")
+class TestDSv4HybridAttentionForwardBackward:
+    """Test forward and backward passes of DSv4HybridSelfAttention."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self, request):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        cls = request.cls
+        cls.config = _make_config(dsa_indexer_loss_coeff=1.0)
+        cls.pg = ProcessGroupCollection.use_mpu_process_groups()
+
+        yield
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.parametrize("layer_number", [1, 2, 3, 4])
+    def test_forward_output_shape(self, layer_number):
+        """Forward should produce [sq, b, hidden_size] output."""
+        seq_len = 256
+        batch_size = 2
+
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        attn = _build_attention(
+            self.config, layer_number=layer_number, pg_collection=self.pg
+        ).cuda()
+
+        hidden = torch.randn(
+            seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+
+        output, bias = attn(hidden_states=hidden, attention_mask=None)
+
+        assert output.shape == (seq_len, batch_size, self.config.hidden_size)
+        assert output.dtype == torch.bfloat16
+        assert not torch.isnan(output).any()
+
+    @pytest.mark.parametrize("layer_number", [1, 2])
+    def test_backward_gradient_flow(self, layer_number):
+        """Backward should produce gradients for all trainable parameters."""
+        seq_len = 256
+        batch_size = 2
+
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        attn = _build_attention(
+            self.config, layer_number=layer_number, pg_collection=self.pg
+        ).cuda()
+        attn.train()
+
+        hidden = (
+            torch.randn(seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16)
+            .cuda()
+            .requires_grad_(True)
+        )
+
+        output, bias = attn(hidden_states=hidden, attention_mask=None)
+        loss = output.sum()
+        loss.backward()
+
+        assert hidden.grad is not None, "No gradient on hidden_states"
+        for name, param in attn.named_parameters():
+            if param.requires_grad:
+                assert param.grad is not None, f"No gradient for parameter {name}"
+
+    def test_eval_mode(self):
+        """Forward should work in eval mode."""
+        seq_len = 128
+        batch_size = 2
+
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        attn = _build_attention(self.config, layer_number=1, pg_collection=self.pg).cuda()
+        attn.eval()
+
+        hidden = torch.randn(
+            seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+
+        with torch.no_grad():
+            output, bias = attn(hidden_states=hidden, attention_mask=None)
+
+        assert output.shape == (seq_len, batch_size, self.config.hidden_size)
+        assert not torch.isnan(output).any()
+
+    def test_different_seq_lengths(self):
+        """Forward should handle various sequence lengths."""
+        batch_size = 2
+
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        attn = _build_attention(self.config, layer_number=2, pg_collection=self.pg).cuda()
+
+        for seq_len in [64, 128, 256]:
+            hidden = torch.randn(
+                seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16
+            ).cuda()
+            output, bias = attn(hidden_states=hidden, attention_mask=None)
+            assert output.shape == (seq_len, batch_size, self.config.hidden_size)
+
+
+# ===========================================================================
+# get_query_key_value_tensors tests
+# ===========================================================================
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available")
+class TestDSv4HybridQKV:
+    """Test get_query_key_value_tensors internals."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self, request):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        cls = request.cls
+        cls.config = _make_config()
+        cls.pg = ProcessGroupCollection.use_mpu_process_groups()
+
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_qkv_shapes(self):
+        """Query, key, value should have correct shapes."""
+        seq_len = 64
+        batch_size = 2
+
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        attn = _build_attention(self.config, layer_number=1, pg_collection=self.pg).cuda()
+        hidden = torch.randn(
+            seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+
+        q, k, v, q_compressed, kv_compressed = attn.get_query_key_value_tensors(hidden)
+
+        n_heads = self.config.num_attention_heads
+        v_dim = self.config.v_head_dim
+
+        assert q.shape == (seq_len, batch_size, n_heads, v_dim)
+        # key and value are single-head (MQA-style) with an extra head dim
+        assert k.shape[-1] == v_dim
+        assert v.shape[-1] == v_dim
+
+    def test_key_equals_value(self):
+        """In the wkv path, key and value should be the same tensor."""
+        seq_len = 64
+        batch_size = 2
+
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        attn = _build_attention(self.config, layer_number=1, pg_collection=self.pg).cuda()
+        hidden = torch.randn(
+            seq_len, batch_size, self.config.hidden_size, dtype=torch.bfloat16
+        ).cuda()
+
+        q, k, v, _, _ = attn.get_query_key_value_tensors(hidden)
+        assert torch.equal(k, v), "key and value should be identical in wkv path"
+
+
+# ===========================================================================
+# Grouped output projection tests
+# ===========================================================================
+
+
+@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+@pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available")
+class TestDSv4HybridGroupedOutput:
+    """Test that grouped output projection (wo_a) parameters are created."""
+
+    @pytest.fixture(scope='class', autouse=True)
+    def setup_method(self):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1, pipeline_model_parallel_size=1
+        )
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_o_group_proj_shape(self):
+        """linear_o_group_proj should have the correct shape."""
+        torch.manual_seed(_SEED)
+        model_parallel_cuda_manual_seed(_SEED)
+
+        o_groups = 8
+        o_lora_rank = 64
+        config = _make_config(o_groups=o_groups, o_lora_rank=o_lora_rank)
+        pg = ProcessGroupCollection.use_mpu_process_groups()
+        attn = _build_attention(config, layer_number=1, pg_collection=pg)
+
+        expected_out = o_groups * o_lora_rank
+        expected_in = (config.v_head_dim * config.num_attention_heads) // o_groups
+        assert attn.linear_o_group_proj.shape == (expected_out, expected_in)
+        assert attn.linear_o_group_proj.requires_grad
diff --git a/tests/unit_tests/transformer/moe/test_aux_loss.py b/tests/unit_tests/transformer/moe/test_aux_loss.py
index ccd11bf29af..3b4697fc71a 100644
--- a/tests/unit_tests/transformer/moe/test_aux_loss.py
+++ b/tests/unit_tests/transformer/moe/test_aux_loss.py
@@ -212,8 +212,9 @@ def new_router(self, **kwargs):
         new_transformer_config = dataclasses.replace(self.default_transformer_config, **kwargs)
 
         # Create the router with the updated config
-        router = TopKRouter(config=new_transformer_config, pg_collection=pg_collection)
-        router.set_layer_number(0)
+        router = TopKRouter(
+            config=new_transformer_config, pg_collection=pg_collection, layer_number=0
+        )
         return router
 
     def teardown_method(self, method):
@@ -626,8 +627,9 @@ def new_router(self, **kwargs):
         """Create a new router with updated configuration."""
         pg_collection = get_default_pg_collection()
         new_transformer_config = dataclasses.replace(self.default_transformer_config, **kwargs)
-        router = TopKRouter(config=new_transformer_config, pg_collection=pg_collection)
-        router.set_layer_number(0)
+        router = TopKRouter(
+            config=new_transformer_config, pg_collection=pg_collection, layer_number=0
+        )
         return router
 
     @pytest.mark.internal
diff --git a/tests/unit_tests/transformer/moe/test_paged_stashing.py b/tests/unit_tests/transformer/moe/test_paged_stashing.py
new file mode 100644
index 00000000000..262346d0609
--- /dev/null
+++ b/tests/unit_tests/transformer/moe/test_paged_stashing.py
@@ -0,0 +1,409 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from megatron.core import config
+from megatron.core.extensions.transformer_engine import HAVE_TE
+from megatron.core.fp8_utils import get_fp8_context
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
+from megatron.core.transformer.moe.moe_layer import MoELayer
+from megatron.core.transformer.moe.moe_utils import get_align_size_for_quantization
+from megatron.core.transformer.moe.paged_stash import (
+    check_paged_stash_overflow,
+    paged_stash_init_chunk_handler,
+    paged_stash_reset,
+)
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import is_te_min_version
+from megatron.training.initialize import _set_random_seed
+from tests.unit_tests.test_utilities import Utils
+
+
+def _global_tokens_per_expert_from_local_routing_map(routing_map: torch.Tensor) -> torch.Tensor:
+    """Per-expert token counts from a local routing map, summed across the default process group.
+
+    ``routing_map`` is shaped [num_local_token_rows, num_experts] (as in
+    ``_HybridEPManager``). Tests here assume world size equals expert-parallel size (all GPUs
+    are EP ranks); ``all_reduce`` on the world group aggregates disjoint local maps.
+    """
+    counts = routing_map.sum(dim=0).to(torch.int64)
+    if torch.distributed.is_initialized() and torch.distributed.get_world_size() > 1:
+        torch.distributed.all_reduce(counts, op=torch.distributed.ReduceOp.SUM)
+    return counts
+
+
+def _tokens_per_expert_from_routing_map(routing_map: torch.Tensor, layer: MoELayer) -> torch.Tensor:
+    """Per-local-expert assignment counts from the routing map (columns for this EP rank)."""
+    counts = _global_tokens_per_expert_from_local_routing_map(routing_map)
+    idx = torch.as_tensor(layer.local_expert_indices, device=counts.device, dtype=torch.long)
+    return counts[idx].to(torch.int64).clone()
+
+
+def _pad_token_counts_to_align_size(
+    tokens_per_expert: torch.Tensor, pad_multiple: int
+) -> torch.Tensor:
+    """Round each count up to a multiple of ``pad_multiple`` (``n + (-n % m)`` like budget)."""
+    t = tokens_per_expert.to(torch.int64)
+    return t + (-t % pad_multiple)
+
+
+class MoEModelTestContainer:
+    def __init__(
+        self,
+        tp_size,
+        ep_size,
+        pp_size,
+        cp_size=1,
+        moe_tp_size=None,
+        data_parallel_random_init=False,
+        num_moe_experts=8,
+        num_layers=1,
+        moe_router_topk=2,
+        moe_router_load_balancing_type="aux_loss",
+        moe_token_dispatcher_type="alltoall",
+        moe_expert_capacity_factor=None,
+        moe_pad_expert_input_to_capacity=False,
+        moe_aux_loss_coeff=0.1,
+        test_dtype=torch.float32,
+        **kwargs,
+    ):
+        self.num_local_experts = num_moe_experts // ep_size
+        self.num_layers = num_layers
+        self.test_dtype = test_dtype
+        if moe_tp_size is None:
+            moe_tp_size = tp_size
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=tp_size,
+            pipeline_model_parallel_size=pp_size,
+            expert_model_parallel_size=ep_size,
+            context_parallel_size=cp_size,
+            expert_tensor_parallel_size=moe_tp_size,
+        )
+        _set_random_seed(seed_=123, data_parallel_random_init=data_parallel_random_init)
+        self.config = TransformerConfig(
+            tensor_model_parallel_size=tp_size,
+            expert_model_parallel_size=ep_size,
+            pipeline_model_parallel_size=pp_size,
+            context_parallel_size=cp_size,
+            expert_tensor_parallel_size=moe_tp_size,
+            fp8='e4m3',
+            fp8_recipe='mxfp8',
+            fp8_wgrad=True,
+            fp8_amax_compute_algo='most_recent',
+            fp8_amax_history_len=1,
+            fp8_interval=1,
+            fp8_margin=0,
+            moe_router_topk=moe_router_topk,
+            num_moe_experts=num_moe_experts,
+            moe_router_load_balancing_type=moe_router_load_balancing_type,
+            moe_token_dispatcher_type=moe_token_dispatcher_type,
+            moe_expert_capacity_factor=moe_expert_capacity_factor,
+            moe_pad_expert_input_to_capacity=moe_pad_expert_input_to_capacity,
+            moe_aux_loss_coeff=moe_aux_loss_coeff,
+            num_layers=num_layers,
+            moe_router_dtype="fp32",
+            hidden_size=kwargs.get("hidden_size", 16),
+            num_attention_heads=kwargs.get("num_attention_heads", 8),
+            use_cpu_initialization=kwargs.get("use_cpu_initialization", True),
+            sequence_parallel=tp_size > 1,
+            add_bias_linear=kwargs.get("add_bias_linear", False),
+            moe_permute_fusion=kwargs.get("moe_permute_fusion", False),
+            moe_flex_dispatcher_backend=kwargs.get("moe_flex_dispatcher_backend", None),
+            moe_grouped_gemm=kwargs.get("moe_grouped_gemm", False),
+            moe_paged_stash=kwargs.get("moe_paged_stash", False),
+            moe_expert_rank_capacity_factor=kwargs.get("moe_expert_rank_capacity_factor", None),
+            moe_router_padding_for_fp8=kwargs.get("moe_router_padding_for_fp8", True),
+            use_transformer_engine_op_fuser=kwargs.get("use_transformer_engine_op_fuser", False),
+            moe_mlp_glu_interleave_size=kwargs.get("moe_mlp_glu_interleave_size", None),
+            moe_router_padding_for_quantization=kwargs.get(
+                "moe_router_padding_for_quantization", False
+            ),
+            gated_linear_unit=kwargs.get("gated_linear_unit", False),
+            activation_func=kwargs.get("activation_func", F.gelu),
+            moe_router_force_biased=kwargs.get("moe_router_force_biased", None),
+            moe_paged_stash_buffer_size_factor_cuda=0.5,
+            moe_paged_stash_buffer_size_factor_cpu=1.5,
+        )
+        self.moe_layers = [self._create_moe_layer(layer_number=i) for i in range(num_layers)]
+        self.moe_layer = self.moe_layers[0]
+
+    def _create_moe_layer(self, layer_number=0):
+        transformer_layer_spec = get_gpt_layer_with_transformer_engine_spec(
+            num_experts=self.config.num_moe_experts, moe_grouped_gemm=True
+        )
+        quantization_context = get_fp8_context(self.config, layer_number, is_init=True)
+        with quantization_context:
+            moe_layer = (
+                MoELayer(
+                    self.config,
+                    transformer_layer_spec.submodules.mlp.submodules,
+                    layer_number=layer_number,
+                )
+                .cuda()
+                .to(dtype=self.test_dtype)
+            )
+            return moe_layer
+
+    def zero_grad(self):
+        for layer in self.moe_layers:
+            layer.zero_grad()
+
+    def __del__(self):
+        torch.distributed.barrier()
+        torch.cuda.synchronize()
+        Utils.destroy_model_parallel()
+
+    def destroy(self):
+        Utils.destroy_model_parallel()
+
+
+def _forward_backward_all_layers(container: MoEModelTestContainer, hidden_states: torch.Tensor):
+    """Forward/backward all MoE layers; returns output, input grad, last layer routing state."""
+    initial_hidden_states = hidden_states.cuda().requires_grad_(True)
+    hidden_states = initial_hidden_states
+    quantization_context = get_fp8_context(container.config)
+    with quantization_context:
+        for layer in container.moe_layers:
+            hidden_states, _ = layer(hidden_states)
+        output = hidden_states
+    last_layer = container.moe_layers[-1]
+    comm = getattr(last_layer.token_dispatcher, "_comm_manager", None)
+    routing_map = getattr(comm, "routing_map", None)
+    tokens_per_expert = (
+        comm.get_number_of_tokens_per_expert()
+        if comm is not None and hasattr(comm, "get_number_of_tokens_per_expert")
+        else None
+    )
+    output.backward(torch.ones_like(output))
+    return (output.detach(), initial_hidden_states.grad, routing_map, tokens_per_expert)
+
+
+def is_hybrid_ep_available():
+    from megatron.core.transformer.moe.fused_a2a import HAVE_HYBRIDEP
+
+    return HAVE_HYBRIDEP
+
+
+def _te_grouped_mlp_op_fuser_environment_supported() -> bool:
+    """Cheap gate matching the start of ``TEGroupedMLP._is_fused_impl_supported`` (experts.py)."""
+    if not HAVE_TE:
+        return False
+    try:
+        from transformer_engine.pytorch.ops import GroupedLinear, ScaledSwiGLU  # noqa: F401
+    except ImportError:
+        return False
+    return is_te_min_version("2.14.0")
+
+
+_TE_GROUPED_MLP_OP_FUSER_SKIP_REASON = (
+    "TEGroupedMLP op fuser (tests use use_transformer_engine_op_fuser=True) requires TE>=2.14 "
+    "with GroupedLinear/ScaledSwiGLU ops"
+)
+
+
+@pytest.mark.skipif(
+    not _te_grouped_mlp_op_fuser_environment_supported(),
+    reason=_TE_GROUPED_MLP_OP_FUSER_SKIP_REASON,
+)
+@pytest.mark.skipif(not is_hybrid_ep_available(), reason="Hybrid EP are not available")
+class TestPagedStashing:
+    def setup_method(self, method):
+        pass
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.internal
+    @pytest.mark.flaky_in_dev
+    def test_forward_backward_4_layers(self):
+        """Test paged stashing with 4 MoE layers: ref run vs paged run match."""
+        if not is_hybrid_ep_available():
+            pytest.skip("Hybrid EP is not available")
+
+        config.ENABLE_EXPERIMENTAL = True
+
+        container = MoEModelTestContainer(
+            tp_size=1,
+            ep_size=4,
+            pp_size=1,
+            num_moe_experts=8,
+            num_layers=4,
+            moe_router_topk=2,
+            moe_router_load_balancing_type="aux_loss",
+            moe_token_dispatcher_type="flex",
+            moe_permute_fusion=True,
+            hidden_size=1024,
+            moe_flex_dispatcher_backend="hybridep",
+            test_dtype=torch.bfloat16,
+            moe_grouped_gemm=True,
+            moe_use_legacy_grouped_gemm=False,
+            moe_paged_stash=True,
+            moe_expert_rank_capacity_factor=1.5,
+            use_transformer_engine_op_fuser=True,
+            moe_mlp_glu_interleave_size=32,
+            moe_router_padding_for_quantization=True,
+            gated_linear_unit=True,
+            activation_func=F.silu,
+        )
+
+        seq_length = 1024
+        batch_size = 1
+        hidden_size = container.config.hidden_size
+        hidden_states = torch.randn((seq_length, batch_size, hidden_size), dtype=torch.bfloat16)
+
+        # First iteration: capture schedule, capacity, etc.
+        paged_stash_reset(True, config=container.config)
+        paged_stash_init_chunk_handler(1, 0)
+        output_ref, hidden_states_grad_ref, routing_map_ref, tokens_per_expert_ref = (
+            _forward_backward_all_layers(container, hidden_states)
+        )
+
+        container.zero_grad()
+
+        # Second iteration: run with paged stash.
+        paged_stash_reset(True, config=container.config)
+        paged_stash_init_chunk_handler(1, 0)
+        output, hidden_states_grad, routing_map, tokens_per_expert = _forward_backward_all_layers(
+            container, hidden_states
+        )
+
+        overflow = check_paged_stash_overflow()
+        assert overflow.any().item() == 0
+
+        assert torch.allclose(
+            output, output_ref, atol=1e-4, rtol=1e-4
+        ), f"output != output_ref: max diff = {(output - output_ref).abs().max().item()}"
+        assert torch.allclose(hidden_states_grad, hidden_states_grad_ref, atol=1e-4, rtol=1e-4), (
+            f"hidden_states_grad != ref: max diff = "
+            f"{(hidden_states_grad - hidden_states_grad_ref).abs().max().item()}"
+        )
+        if routing_map is not None and tokens_per_expert is not None:
+            num_tokens_per_ep_rank = tokens_per_expert.sum().item()
+            assert (
+                num_tokens_per_ep_rank > 0
+            ), f"num_tokens_per_ep_rank={num_tokens_per_ep_rank} (expected > 0)"
+            assert routing_map_ref is not None and tokens_per_expert_ref is not None
+            tpe_f = tokens_per_expert.float()
+            ref_f = tokens_per_expert_ref.float()
+            assert torch.allclose(
+                tpe_f, ref_f, atol=1e-4, rtol=1e-4
+            ), f"tokens_per_expert != ref: max diff = {(tpe_f - ref_f).abs().max().item()}"
+
+
+@pytest.mark.skipif(
+    not _te_grouped_mlp_op_fuser_environment_supported(),
+    reason=_TE_GROUPED_MLP_OP_FUSER_SKIP_REASON,
+)
+@pytest.mark.skipif(not is_hybrid_ep_available(), reason="Hybrid EP are not available")
+class TestPagedStashingOverBudget:
+    def setup_method(self, method):
+        pass
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.internal
+    @pytest.mark.flaky_in_dev
+    def test_overload_factor_and_over_budget(self):
+        """Budget matches HybridEP setup_metadata; over_budget matches map-derived load."""
+        if not is_hybrid_ep_available():
+            pytest.skip("Hybrid EP is not available")
+
+        config.ENABLE_EXPERIMENTAL = True
+
+        container = MoEModelTestContainer(
+            tp_size=1,
+            ep_size=4,
+            pp_size=1,
+            num_moe_experts=8,
+            num_layers=4,
+            moe_router_topk=2,
+            moe_router_load_balancing_type="aux_loss",
+            moe_token_dispatcher_type="flex",
+            moe_permute_fusion=True,
+            hidden_size=1024,
+            moe_flex_dispatcher_backend="hybridep",
+            test_dtype=torch.bfloat16,
+            moe_grouped_gemm=True,
+            moe_use_legacy_grouped_gemm=False,
+            moe_paged_stash=True,
+            moe_expert_rank_capacity_factor=1.5,
+            use_transformer_engine_op_fuser=True,
+            moe_mlp_glu_interleave_size=32,
+            moe_router_padding_for_quantization=True,
+            gated_linear_unit=True,
+            activation_func=F.silu,
+            moe_router_force_biased=1,
+        )
+
+        seq_length = 1024
+        batch_size = 1
+        topk = container.config.moe_router_topk
+        capacity_factor = container.config.moe_expert_rank_capacity_factor
+        hidden_states = torch.randn(
+            (seq_length, batch_size, container.config.hidden_size), dtype=torch.bfloat16
+        )
+
+        num_tokens = seq_length * batch_size * topk
+        pad_multiple = get_align_size_for_quantization(container.config)
+        budget = int(num_tokens * capacity_factor)
+        budget += -budget % pad_multiple
+
+        paged_stash_reset(True, config=container.config)
+        paged_stash_init_chunk_handler(1, 0)
+        _forward_backward_all_layers(container, hidden_states)
+
+        overflow = check_paged_stash_overflow()
+        num_layers = len(container.moe_layers)
+        stash_cuda = container.config.moe_paged_stash_buffer_size_factor_cuda
+        stash_cpu = container.config.moe_paged_stash_buffer_size_factor_cpu
+        stash_buffer_size = num_tokens * num_layers * (stash_cuda + stash_cpu)
+
+        total_tokens = 0
+        for layer_idx, layer in enumerate(container.moe_layers):
+            comm = getattr(layer.token_dispatcher, "_comm_manager", None)
+            routing_map = getattr(comm, "routing_map", None) if comm is not None else None
+            over_budget_tensor = (
+                layer.token_dispatcher.check_over_budget()
+                if hasattr(layer.token_dispatcher, "check_over_budget")
+                else None
+            )
+            over_budget = over_budget_tensor.item() if over_budget_tensor is not None else False
+
+            assert routing_map is not None, f"layer {layer_idx}: routing_map is None"
+            assert routing_map.dim() == 2, f"layer {layer_idx}: expected 2D routing_map"
+            assert routing_map.shape[1] == container.config.num_moe_experts, (
+                f"layer {layer_idx}: routing_map has {routing_map.shape[1]} experts, "
+                f"expected {container.config.num_moe_experts}"
+            )
+            tokens_per_expert_from_map = _tokens_per_expert_from_routing_map(routing_map, layer)
+            tokens_per_expert_from_map_padded = _pad_token_counts_to_align_size(
+                tokens_per_expert_from_map, pad_multiple
+            )
+            tokens_per_ep_rank_from_map = tokens_per_expert_from_map_padded.sum().item()
+            total_tokens += tokens_per_ep_rank_from_map
+
+            # Padded map-derived tokens strictly over budget iff dispatcher reports over_budget
+            if tokens_per_ep_rank_from_map > budget:
+                assert over_budget, (
+                    f"layer {layer_idx}: tokens_per_ep_rank_from_map "
+                    f"({tokens_per_ep_rank_from_map}) > budget ({budget}), "
+                    f"but over_budget flag was not set"
+                )
+            else:
+                assert not over_budget, (
+                    f"layer {layer_idx}: tokens_per_ep_rank_from_map "
+                    f"({tokens_per_ep_rank_from_map}) <= budget ({budget}), "
+                    f"but over_budget flag was set"
+                )
+
+        overflow_set = overflow.any().item()
+        stash_exceeded = total_tokens > stash_buffer_size
+        assert overflow_set == stash_exceeded, (
+            f"overflow {overflow_set} should match total_tokens > stash_buffer_size "
+            f"({total_tokens} > {stash_buffer_size})"
+        )
diff --git a/tests/unit_tests/transformer/moe/test_routers.py b/tests/unit_tests/transformer/moe/test_routers.py
index 8f3dbbe96e0..40802ac65dc 100644
--- a/tests/unit_tests/transformer/moe/test_routers.py
+++ b/tests/unit_tests/transformer/moe/test_routers.py
@@ -8,8 +8,12 @@
 
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_submodules
 from megatron.core.transformer.moe.moe_layer import MoELayer
-from megatron.core.transformer.moe.moe_utils import get_updated_expert_bias, router_gating_linear
-from megatron.core.transformer.moe.router import Router
+from megatron.core.transformer.moe.moe_utils import (
+    get_default_pg_collection,
+    get_updated_expert_bias,
+    router_gating_linear,
+)
+from megatron.core.transformer.moe.router import Router, TopKRouter
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.training.initialize import _set_random_seed
 from tests.unit_tests.test_utilities import Utils
@@ -563,3 +567,137 @@ def test_router_gating_linear_bias(router_dtype):
     assert torch.allclose(inp.grad, ref_inp.grad, **tols)
     assert torch.allclose(weight.grad, ref_weight.grad, **tols)
     assert torch.allclose(bias.grad, ref_bias.grad, **tols)
+
+
+# ============================================================
+# Hash-based MoE routing tests
+# ============================================================
+
+
+def _hash_routing_config(**overrides):
+    """Create a base TransformerConfig suitable for hash routing tests."""
+    defaults = dict(
+        num_layers=2,
+        hidden_size=16,
+        num_attention_heads=8,
+        num_moe_experts=4,
+        moe_router_topk=2,
+        moe_router_load_balancing_type="aux_loss",
+        moe_aux_loss_coeff=0.0,
+        moe_router_dtype="fp32",
+        add_bias_linear=False,
+        use_cpu_initialization=True,
+        moe_n_hash_layers=1,
+        actual_vocab_size=128,
+    )
+    defaults.update(overrides)
+    return TransformerConfig(**defaults)
+
+
+class TestHashRouting:
+    """Test hash-based MoE routing (_hash_routing, is_hash_layer, config validation)."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=1,
+            pipeline_model_parallel_size=1,
+            expert_model_parallel_size=1,
+        )
+        _set_random_seed(seed_=42, data_parallel_random_init=False)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    @pytest.mark.internal
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    @pytest.mark.parametrize("score_function", ["softmax", "sigmoid", "sqrtsoftplus"])
+    def test_hash_routing_correctness(self, score_function):
+        """Verify expert selection matches tid2eid and scores are computed correctly."""
+        config = _hash_routing_config(moe_router_score_function=score_function)
+        pg_collection = get_default_pg_collection()
+        router = TopKRouter(config=config, pg_collection=pg_collection, layer_number=1)
+
+        num_tokens, num_experts = 16, 4
+        logits = torch.randn(num_tokens, num_experts, device="cuda")
+        input_ids = torch.randint(0, 128, (4, 4), device="cuda")
+
+        routing_probs, routing_map = router._hash_routing(logits, input_ids)
+
+        # Compute expected
+        if score_function == "softmax":
+            scores = torch.softmax(logits, dim=-1, dtype=torch.float32).type_as(logits)
+        elif score_function == "sigmoid":
+            scores = torch.sigmoid(logits.float()).type_as(logits)
+        else:
+            scores = torch.nn.functional.softplus(logits.float()).sqrt().type_as(logits)
+
+        flat_ids = input_ids.T.reshape(-1)
+        top_indices = router.tid2eid[flat_ids].long()
+        probs = scores.gather(1, top_indices)
+        if score_function != "softmax":
+            probs = probs / (probs.sum(dim=-1, keepdim=True) + 1e-20)
+
+        # Each token routed to exactly topk experts matching tid2eid
+        assert (routing_map.sum(dim=1) == router.topk).all()
+        for i in range(num_tokens):
+            actual = routing_map[i].nonzero(as_tuple=True)[0].sort().values
+            expected = top_indices[i].sort().values
+            assert torch.equal(actual, expected)
+            for k in range(router.topk):
+                expert_idx = top_indices[i, k].item()
+                assert torch.isclose(routing_probs[i, expert_idx], probs[i, k], atol=1e-5)
+
+    @pytest.mark.internal
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_is_hash_layer_logic(self):
+        """Test layer boundary, MTP guard, and expert bias interaction."""
+        pg_collection = get_default_pg_collection()
+
+        # Boundary: layers within/beyond moe_n_hash_layers
+        config = _hash_routing_config(moe_n_hash_layers=2)
+        r1 = TopKRouter(config=config, pg_collection=pg_collection, layer_number=1)
+        r2 = TopKRouter(config=config, pg_collection=pg_collection, layer_number=2)
+        r3 = TopKRouter(config=config, pg_collection=pg_collection, layer_number=3)
+        assert r1.is_hash_layer is True and r1.tid2eid is not None
+        assert r2.is_hash_layer is True
+        assert r3.is_hash_layer is False and r3.tid2eid is None
+
+        # MTP layers bypass hash routing
+        mtp_router = TopKRouter(
+            config=config, pg_collection=pg_collection, layer_number=1, is_mtp_layer=True
+        )
+        assert mtp_router.is_hash_layer is False and mtp_router.tid2eid is None
+
+        # Expert bias disabled on hash layers
+        bias_config = _hash_routing_config(
+            moe_n_hash_layers=1,
+            moe_router_enable_expert_bias=True,
+            moe_router_score_function="sigmoid",
+        )
+        hash_r = TopKRouter(config=bias_config, pg_collection=pg_collection, layer_number=1)
+        normal_r = TopKRouter(config=bias_config, pg_collection=pg_collection, layer_number=2)
+        assert hash_r.enable_expert_bias is False
+        assert normal_r.enable_expert_bias is True
+
+    @pytest.mark.internal
+    @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
+    def test_moe_layer_hash_routing_integration(self):
+        """End-to-end MoELayer forward/backward with hash routing; raises without input_ids."""
+        config = _hash_routing_config(moe_n_hash_layers=1)
+        submodules = get_gpt_layer_local_submodules(
+            num_experts=config.num_moe_experts, moe_grouped_gemm=False
+        )
+        moe_layer = MoELayer(config, submodules.mlp.submodules, layer_number=1).cuda()
+
+        hidden_states = torch.randn(8, 2, 16, device="cuda", requires_grad=True)
+        input_ids = torch.randint(0, 128, (2, 8), device="cuda")
+
+        # Forward succeeds with input_ids
+        output, _ = moe_layer(hidden_states, input_ids=input_ids)
+        assert output.shape == hidden_states.shape
+        assert not torch.isnan(output).any()
+
+        # Backward succeeds
+        output.sum().backward()
+        assert hidden_states.grad is not None
+        assert not torch.isnan(hidden_states.grad).any()
diff --git a/tests/unit_tests/transformer/moe/test_token_dispatcher.py b/tests/unit_tests/transformer/moe/test_token_dispatcher.py
index 6ff8fcdc6e5..dead2c0c12d 100644
--- a/tests/unit_tests/transformer/moe/test_token_dispatcher.py
+++ b/tests/unit_tests/transformer/moe/test_token_dispatcher.py
@@ -1,7 +1,8 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import copy
 import dataclasses
+from types import SimpleNamespace
 
 import pytest
 import torch
@@ -10,6 +11,7 @@
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_submodules
 from megatron.core.transformer.moe.moe_layer import MoELayer
 from megatron.core.transformer.moe.moe_utils import get_capacity
+from megatron.core.transformer.moe.token_dispatcher import MoETokenDispatcher
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.typed_torch import apply_module
 from megatron.core.utils import is_te_min_version
@@ -33,6 +35,48 @@ def token_unpermutation(token_dispatcher, hidden_states):
     return hidden_states, None
 
 
+class _NestedAttrTestDispatcher(MoETokenDispatcher):
+    def dispatch_preprocess(self, tokens, routing_map, probs):
+        raise NotImplementedError
+
+    def token_dispatch(self, hidden_states, probs):
+        raise NotImplementedError
+
+    def dispatch_postprocess(self, hidden_states, probs):
+        raise NotImplementedError
+
+    def combine_preprocess(self, hidden_states):
+        raise NotImplementedError
+
+    def token_combine(self, hidden_states):
+        raise NotImplementedError
+
+    def combine_postprocess(self, hidden_states):
+        raise NotImplementedError
+
+
+def test_get_cudagraph_attr_supports_nested_paths():
+    dispatcher = object.__new__(_NestedAttrTestDispatcher)
+    token_probs = torch.randn(2, 3)
+    dispatcher._comm_manager = SimpleNamespace(
+        token_probs=token_probs, nested=SimpleNamespace(routing_map=torch.randn(2, 4))
+    )
+
+    assert dispatcher.get_cudagraph_attr("_comm_manager.token_probs") is token_probs
+    assert dispatcher.get_cudagraph_attr("_comm_manager.nested.routing_map") is not None
+    assert dispatcher.get_cudagraph_attr("_comm_manager.missing_attr") is None
+
+
+def test_set_cudagraph_attr_supports_nested_paths():
+    dispatcher = object.__new__(_NestedAttrTestDispatcher)
+    dispatcher._comm_manager = SimpleNamespace(routing_map=None)
+    routing_map = torch.randn(4, 5)
+
+    dispatcher.set_cudagraph_attr("_comm_manager.routing_map", routing_map)
+
+    assert dispatcher._comm_manager.routing_map is routing_map
+
+
 class MoEModelTestContainer:
     def __init__(
         self,
@@ -103,8 +147,11 @@ def new_moe_layer(self, **kargs):
             num_experts=self.config.num_moe_experts, moe_grouped_gemm=self.config.moe_grouped_gemm
         )
         new_config = dataclasses.replace(self.config, **kargs)
-        moe_layer = MoELayer(new_config, submodules.mlp.submodules).cuda().to(dtype=self.test_dtype)
-        moe_layer.set_layer_number(0)
+        moe_layer = (
+            MoELayer(new_config, submodules.mlp.submodules, layer_number=0)
+            .cuda()
+            .to(dtype=self.test_dtype)
+        )
         return moe_layer
 
     def __del__(self):
@@ -430,11 +477,24 @@ def teardown_method(self, method):
     @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)])
     @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
     @pytest.mark.parametrize("moe_flex_dispatcher_backend", ["deepep", "hybridep"])
-    def test_forward_backward(self, tp_size, ep_size, permute_fusion, moe_flex_dispatcher_backend):
+    @pytest.mark.parametrize("moe_permute_fusion_into_hybridep", [True, False])
+    def test_forward_backward(
+        self,
+        tp_size,
+        ep_size,
+        permute_fusion,
+        moe_flex_dispatcher_backend,
+        moe_permute_fusion_into_hybridep,
+    ):
         if moe_flex_dispatcher_backend == "deepep" and not is_deep_ep_available():
             pytest.skip("Deep EP is not available")
         if moe_flex_dispatcher_backend == "hybridep" and not is_hybrid_ep_available():
             pytest.skip("Hybrid EP is not available")
+        if moe_permute_fusion_into_hybridep:
+            if permute_fusion or moe_flex_dispatcher_backend != "hybridep":
+                pytest.skip(
+                    "moe_permute_fusion_into_hybridep skipped because permute_fusion or hybridep is not set"
+                )
         if permute_fusion:
             config.ENABLE_EXPERIMENTAL = True
         container = MoEModelTestContainer(
@@ -448,6 +508,7 @@ def test_forward_backward(self, tp_size, ep_size, permute_fusion, moe_flex_dispa
             moe_permute_fusion=permute_fusion,
             hidden_size=1024,
             moe_flex_dispatcher_backend=moe_flex_dispatcher_backend,
+            moe_permute_fusion_into_hybridep=moe_permute_fusion_into_hybridep,
             test_dtype=torch.bfloat16,
         )
         container.dispatcher_dropless_test()
@@ -460,13 +521,24 @@ def test_forward_backward(self, tp_size, ep_size, permute_fusion, moe_flex_dispa
     @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)])
     @pytest.mark.parametrize("permute_fusion", permute_fusion_params)
     @pytest.mark.parametrize("moe_flex_dispatcher_backend", ["deepep", "hybridep"])
+    @pytest.mark.parametrize("moe_permute_fusion_into_hybridep", [True, False])
     def test_capacity_forward_backward(
-        self, tp_size, ep_size, permute_fusion, moe_flex_dispatcher_backend
+        self,
+        tp_size,
+        ep_size,
+        permute_fusion,
+        moe_flex_dispatcher_backend,
+        moe_permute_fusion_into_hybridep,
     ):
         if moe_flex_dispatcher_backend == "deepep" and not is_deep_ep_available():
             pytest.skip("Deep EP is not available")
         if moe_flex_dispatcher_backend == "hybridep" and not is_hybrid_ep_available():
             pytest.skip("Hybrid EP is not available")
+        if moe_permute_fusion_into_hybridep:
+            if permute_fusion or moe_flex_dispatcher_backend != "hybridep":
+                pytest.skip(
+                    "moe_permute_fusion_into_hybridep skipped because permute_fusion or hybridep is not set"
+                )
         if permute_fusion:
             config.ENABLE_EXPERIMENTAL = True
         container = MoEModelTestContainer(
@@ -483,6 +555,7 @@ def test_capacity_forward_backward(
             moe_permute_fusion=permute_fusion,
             hidden_size=1024,
             moe_flex_dispatcher_backend=moe_flex_dispatcher_backend,
+            moe_permute_fusion_into_hybridep=moe_permute_fusion_into_hybridep,
             test_dtype=torch.bfloat16,
         )
         container.dispatcher_capacity_test()
@@ -497,13 +570,24 @@ def test_capacity_forward_backward(
     @pytest.mark.parametrize("tp_size,ep_size", [(1, 8), (8, 1), (4, 2)])
     @pytest.mark.parametrize("permute_fusion", [True])
     @pytest.mark.parametrize("moe_flex_dispatcher_backend", ["deepep", "hybridep"])
+    @pytest.mark.parametrize("moe_permute_fusion_into_hybridep", [True, False])
     def test_router_padding_for_fp8_forward_backward(
-        self, tp_size, ep_size, permute_fusion, moe_flex_dispatcher_backend
+        self,
+        tp_size,
+        ep_size,
+        permute_fusion,
+        moe_flex_dispatcher_backend,
+        moe_permute_fusion_into_hybridep,
     ):
         if moe_flex_dispatcher_backend == "deepep" and not is_deep_ep_available():
             pytest.skip("Deep EP is not available")
         if moe_flex_dispatcher_backend == "hybridep" and not is_hybrid_ep_available():
             pytest.skip("Hybrid EP is not available")
+        if moe_permute_fusion_into_hybridep:
+            if permute_fusion or moe_flex_dispatcher_backend != "hybridep":
+                pytest.skip(
+                    "moe_permute_fusion_into_hybridep skipped because permute_fusion or hybridep is not set"
+                )
         if permute_fusion:
             config.ENABLE_EXPERIMENTAL = True
         container = MoEModelTestContainer(
@@ -518,6 +602,7 @@ def test_router_padding_for_fp8_forward_backward(
             moe_permute_fusion=permute_fusion,
             hidden_size=1024,
             moe_flex_dispatcher_backend=moe_flex_dispatcher_backend,
+            moe_permute_fusion_into_hybridep=moe_permute_fusion_into_hybridep,
             test_dtype=torch.bfloat16,
         )
         container.dispatcher_router_padding_for_fp8_test()
diff --git a/tests/unit_tests/transformer/test_attention.py b/tests/unit_tests/transformer/test_attention.py
index ab5a33aa61b..03521a30384 100644
--- a/tests/unit_tests/transformer/test_attention.py
+++ b/tests/unit_tests/transformer/test_attention.py
@@ -1,11 +1,13 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 import copy
 from unittest import mock
 
+import einops
 import pytest
 import torch
 from packaging import version
+from torch.nn import functional as F
 
 import megatron.core.parallel_state as parallel_state
 from megatron.core.hyper_comm_grid import HyperCommGrid
@@ -13,6 +15,7 @@
     get_pos_emb_on_this_cp_rank as get_tensor_on_this_cp_rank,
 )
 from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_local_spec,
     get_gpt_layer_with_transformer_engine_spec,
     get_gpt_layer_with_transformer_engine_submodules,
 )
@@ -21,6 +24,10 @@
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
 from megatron.core.transformer import TransformerConfig
 from megatron.core.transformer.attention import SelfAttention
+from megatron.core.transformer.dot_product_attention_context_parallel import (
+    AttentionFuncionWithContextParallel,
+    to_zz_mask_attn_bias,
+)
 from megatron.core.transformer.enums import AttnMaskType
 from megatron.core.utils import is_te_min_version
 from megatron.training.arguments import parse_args
@@ -34,6 +41,7 @@
     init_checkpointing_mock_args,
 )
 from tests.unit_tests.test_utilities import Utils
+from tests.unit_tests.transformer.test_multi_latent_attention import make_test_packed_seq_params
 
 try:
     from transformer_engine.pytorch.attention.rope import apply_fused_qkv_rotary_pos_emb
@@ -44,10 +52,19 @@
 
 
 @pytest.mark.parametrize("output_gate", [False, True])
+@pytest.mark.parametrize(
+    ("transformer_impl", "fallback_to_eager_attn"),
+    [("transformer_engine", False), ("transformer_engine", True), ("native", False)],
+)
 class TestParallelAttention:
 
     @pytest.fixture(scope='function', autouse=True)
-    def setup_method(self, output_gate):
+    def setup_method(self, output_gate, transformer_impl, fallback_to_eager_attn):
+        if output_gate:
+            if transformer_impl == "native":
+                pytest.skip("Native implementation does not support output gate.")
+            if fallback_to_eager_attn:
+                pytest.skip("No need to test output gate for fallback_to_eager_attn = True.")
         Utils.initialize_model_parallel(1, 1)
         model_parallel_cuda_manual_seed(123)
         self.transformer_config = TransformerConfig(
@@ -58,11 +75,17 @@ def setup_method(self, output_gate):
             bf16=True,
             params_dtype=torch.bfloat16,
             attention_output_gate=output_gate,
+            transformer_impl=transformer_impl,
+            fallback_to_eager_attn=fallback_to_eager_attn,
         )
+        if transformer_impl == "transformer_engine":
+            attn_layer_spec = (
+                get_gpt_layer_with_transformer_engine_submodules().self_attention.submodules
+            )
+        else:
+            attn_layer_spec = get_gpt_layer_local_spec().submodules.self_attention.submodules
         self.parallel_attention = SelfAttention(
-            self.transformer_config,
-            get_gpt_layer_with_transformer_engine_submodules().self_attention.submodules,
-            layer_number=1,
+            self.transformer_config, attn_layer_spec, layer_number=1
         )
 
     def teardown_method(self):
@@ -73,10 +96,19 @@ def test_constructor(self):
         assert self.parallel_attention.layer_number == 1
 
         num_weights = sum([p.numel() for p in self.parallel_attention.parameters()])
+
+        hidden_size = self.transformer_config.hidden_size
+        standard_num_weights = (
+            hidden_size * hidden_size * 4 + hidden_size * 4  # QKVO weight  # QKVO bias
+        )
         if self.transformer_config.attention_output_gate:
-            assert num_weights == 82816
-        else:
-            assert num_weights == 66304
+            standard_num_weights += hidden_size * hidden_size + hidden_size  # Gate weight and bias
+        if self.transformer_config.transformer_impl == "transformer_engine":
+            standard_num_weights += hidden_size * 2  # fused pre layernorm weight and bias
+
+        assert (
+            num_weights == standard_num_weights
+        ), f"{num_weights=} does not match {standard_num_weights=}."
 
     def test_cpu_forward(self):
         # we can't currently do this because the global memory buffer is on GPU
@@ -111,6 +143,8 @@ def test_gpu_forward(self):
     @pytest.mark.parametrize("rotary_interleaved", [True, False])
     @pytest.mark.parametrize("fused_qkv_rope", [True, False])
     def test_fused_rope_gpu_forward(self, rotary_interleaved, fused_qkv_rope):
+        if self.transformer_config.fallback_to_eager_attn:
+            pytest.skip("No need to test fused RoPE for fallback_to_eager_attn = True.")
         self.parallel_attention.config.apply_rope_fusion = True
         if rotary_interleaved and not is_te_min_version("2.3.0"):
             pytest.skip("Only TE >= 2.3.0 supports interleaved fused RoPE.")
@@ -374,6 +408,199 @@ class TestSelfAttention:
 
     @pytest.fixture(scope='function', autouse=True)
     def setup_method(self, output_gate):
+        self.output_gate = output_gate
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+    def teardown_method(self):
+        Utils.destroy_model_parallel()
+
+    def test_clip_qk_disabled_raises_error(self):
+        """Test that clip_qk raises ValueError when qk_clip is not enabled."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=False,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+
+        with pytest.raises(ValueError, match="qk_clip option needs to be enabled"):
+            attention.clip_qk()
+
+    def test_clip_qk_none_logits_raises_error(self):
+        """Test that clip_qk raises ValueError when current_max_attn_logits is None."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+
+        with pytest.raises(ValueError, match="current_max_attn_logits is None"):
+            attention.clip_qk()
+
+    def test_clip_qk_below_threshold_no_update(self):
+        """Test that weights are not updated when max logits are below threshold."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+        attention.cuda()
+
+        # Save original weights
+        original_weight = attention.linear_qkv.weight.data.clone()
+
+        # Set current_max_attn_logits below threshold
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [50.0, 60.0, 70.0, 80.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should not be updated
+        assert torch.equal(attention.linear_qkv.weight.data, original_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_above_threshold_updates_weights(self):
+        """Test that weights are updated when max logits exceed threshold."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+        attention.cuda()
+
+        # Save original weights
+        original_weight = attention.linear_qkv.weight.data.clone()
+
+        # Set current_max_attn_logits above threshold
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [150.0, 160.0, 170.0, 180.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should be updated
+        assert not torch.equal(attention.linear_qkv.weight.data, original_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_gqa_configuration(self):
+        """Test clip_qk with GQA (Grouped Query Attention) configuration."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=8,
+            num_query_groups=4,  # GQA with 2 heads per group
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+        attention.cuda()
+
+        # Save original weights
+        original_weight = attention.linear_qkv.weight.data.clone()
+
+        # Set current_max_attn_logits for all heads (8 heads)
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [150.0, 160.0, 170.0, 180.0, 190.0, 200.0, 210.0, 220.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should be updated
+        assert not torch.equal(attention.linear_qkv.weight.data, original_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+    def test_clip_qk_mixed_logits(self):
+        """Test clip_qk with mixed logits (some above, some below threshold)."""
+        transformer_config = TransformerConfig(
+            num_layers=2,
+            hidden_size=128,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            qk_clip=True,
+            qk_clip_threshold=100.0,
+            qk_clip_alpha=0.5,
+        )
+        attention = SelfAttention(
+            transformer_config,
+            get_gpt_layer_with_transformer_engine_spec().submodules.self_attention.submodules,
+            layer_number=1,
+        )
+        attention.cuda()
+
+        # Save original weights
+        original_weight = attention.linear_qkv.weight.data.clone()
+
+        # Set mixed current_max_attn_logits (some above, some below threshold)
+        attention.core_attention.current_max_attn_logits = torch.tensor(
+            [80.0, 150.0, 90.0, 200.0], device='cuda'
+        )
+
+        # Call clip_qk
+        attention.clip_qk()
+
+        # Weights should be updated since at least one head exceeds threshold
+        assert not torch.equal(attention.linear_qkv.weight.data, original_weight)
+        # current_max_attn_logits should be reset
+        assert attention.core_attention.current_max_attn_logits is None
+
+
+@pytest.mark.parametrize("output_gate", [False, True])
+@pytest.mark.parametrize("transformer_impl", ["transformer_engine", "native"])
+class TestSelfAttention:
+
+    @pytest.fixture(scope='function', autouse=True)
+    def setup_method(self, output_gate, transformer_impl):
+        if transformer_impl == "native":
+            if output_gate:
+                pytest.skip("Native implementation does not support output gate.")
+        self.transformer_impl = transformer_impl
         self.output_gate = output_gate
         Utils.destroy_model_parallel()
 
@@ -389,10 +616,17 @@ def run_self_attention(self, pg_collection):
             attention_output_gate=self.output_gate,
             tensor_model_parallel_size=tensor_model_parallel_size,
             use_cpu_initialization=False,
+            transformer_impl=self.transformer_impl,
         )
+        if self.transformer_impl == "transformer_engine":
+            attn_layer_spec = (
+                get_gpt_layer_with_transformer_engine_submodules().self_attention.submodules
+            )
+        else:
+            attn_layer_spec = get_gpt_layer_local_spec().submodules.self_attention.submodules
         self.self_attention = SelfAttention(
             self.transformer_config,
-            get_gpt_layer_with_transformer_engine_submodules().self_attention.submodules,
+            attn_layer_spec,
             layer_number=1,
             attn_mask_type=AttnMaskType.causal,
             pg_collection=pg_collection,
@@ -479,6 +713,7 @@ def _test_parallel_attention_correctness(
     seed=123,
     sequence_length=256,
     micro_batch_size=4,
+    sequence_packing=False,
 ):
     # Model initialization function
     def initialize_gpt_model(
@@ -572,17 +807,24 @@ def initialize_gpt_model(
         def get_tensor_on_this_rank(tensor):
             if cp > 1:
                 tensor = get_tensor_on_this_cp_rank(tensor, 0, cp_group)
+            if sequence_packing:
+                tensor = tensor.transpose(0, 1).contiguous().view(-1, 1, *tensor.shape[2:])
             if tp > 1 and sp:
-                sp_seg = sequence_length // tp // cp
+                sp_seg = tensor.shape[0] // tp
                 tensor = tensor[tp_rank * sp_seg : (tp_rank + 1) * sp_seg]
             return tensor
 
         # Calculate parallel model output
+        if sequence_packing:
+            cu_seqlens = [i * sequence_length for i in range(micro_batch_size + 1)]
+            packed_seq_params = make_test_packed_seq_params(cu_seqlens=cu_seqlens)
+        else:
+            packed_seq_params = None
         input_hidden_states = get_tensor_on_this_rank(input_hidden_states)
         input_hidden_states = input_hidden_states.detach().requires_grad_(True)
         parallel_attention = gpt_model[0].decoder.layers[0].self_attention
         output_hidden_states_parallel, bias_hidden_states_parallel = parallel_attention(
-            input_hidden_states, attention_mask=None
+            input_hidden_states, attention_mask=None, packed_seq_params=packed_seq_params
         )
         output_hidden_states_parallel.sum().backward()
         input_grad_parallel = input_hidden_states.grad.detach()
@@ -647,6 +889,8 @@ def get_tensor_on_this_rank(tensor):
         Utils.destroy_model_parallel()
 
 
+# TODO(yuzhongw): Add test case for fallback_to_eager_attn
+@pytest.mark.parametrize("sequence_packing", [False, True])
 @pytest.mark.parametrize("apply_rope_fusion", [False, True])
 @pytest.mark.parametrize(
     ("tp", "sp", "cp"),
@@ -661,7 +905,7 @@ def get_tensor_on_this_rank(tensor):
 @pytest.mark.parametrize("qk_layernorm", [False, True])
 @pytest.mark.parametrize("output_gate", [False, True])
 def test_parallel_attention_correctness(
-    tmp_path_dist_ckpt, apply_rope_fusion, tp, sp, cp, qk_layernorm, output_gate
+    tmp_path_dist_ckpt, sequence_packing, apply_rope_fusion, tp, sp, cp, qk_layernorm, output_gate
 ):
     transformer_config = TransformerConfig(
         num_layers=1,
@@ -690,6 +934,7 @@ def test_parallel_attention_correctness(
         cp=cp,
         seed=123,
         sequence_length=256,
+        sequence_packing=sequence_packing,
     )
 
 
diff --git a/tests/unit_tests/transformer/test_cuda_graphs.py b/tests/unit_tests/transformer/test_cuda_graphs.py
index ee4ff7d152d..01150d65570 100644
--- a/tests/unit_tests/transformer/test_cuda_graphs.py
+++ b/tests/unit_tests/transformer/test_cuda_graphs.py
@@ -928,7 +928,7 @@ def create_test_args(
             args.num_layers_at_end_in_bf16 = 1
 
         for key, value in kwargs.items():
-            assert hasattr(args, key)
+            assert hasattr(args, key) or hasattr(TransformerConfig, key), f"Unknown argument: {key}"
             setattr(args, key, value)
 
         validate_args(args)
@@ -1087,6 +1087,63 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa
             reset_hybrid_ep_buffer()
         Utils.destroy_model_parallel()
 
+    @pytest.mark.flaky
+    @pytest.mark.flaky_in_dev
+    @pytest.mark.skipif(
+        not (HAVE_TE and is_te_min_version("2.10.0")),
+        reason="Partial CUDA graph UT support requires TransformerEngine version >= 2.10.0",
+    )
+    @pytest.mark.parametrize("ep_size", [1, 4])
+    def test_mhc_moe_partial_cudagraph(self, ep_size):
+        """Test that mHC (Hyper Connection) layers produce identical loss curves
+        with and without TE partial CUDA graph capture.
+
+        This validates the fix where HyperConnectionTransformerLayer overrides
+        _te_cuda_graph_replay_impl (not _te_cuda_graph_replay) so that the parent's
+        delay_offload_until_cuda_graph lifecycle and overlap_moe_expert_parallel_comm
+        handling are preserved.
+        """
+        initialize_rng_tracker(use_te_rng_tracker=True, force_reset=True)
+        Utils.initialize_model_parallel(
+            tensor_model_parallel_size=self.tp_size,
+            context_parallel_size=self.cp_size,
+            pipeline_model_parallel_size=1,
+            expert_tensor_parallel_size=1 if ep_size > 1 else self.tp_size,
+            expert_model_parallel_size=ep_size,
+        )
+
+        extra_kwargs = {
+            "enable_hyper_connections": True,
+            "num_residual_streams": 4,
+            "mtp_num_layers": None,  # mHC is incompatible with MTP
+        }
+
+        loss_list_ref = self._run_test_helper(ep_size, "none", None, 0, **extra_kwargs)
+        for cuda_graph_scope in [
+            [CudaGraphScope.attn],
+            [CudaGraphScope.mlp, CudaGraphScope.moe_router],
+            [
+                CudaGraphScope.attn,
+                CudaGraphScope.mlp,
+                CudaGraphScope.moe_router,
+                CudaGraphScope.moe_preprocess,
+            ],
+        ]:
+            cuda_graph_warmup_steps = 3
+            loss_list = self._run_test_helper(
+                ep_size,
+                "transformer_engine",
+                cuda_graph_scope,
+                cuda_graph_warmup_steps,
+                **extra_kwargs,
+            )
+            assert torch.equal(loss_list, loss_list_ref), (
+                f"mHC loss mismatch with cuda_graph_scope={cuda_graph_scope}, ep_size={ep_size}. "
+                f"Max diff: {torch.max(torch.abs(loss_list - loss_list_ref))}"
+            )
+
+        Utils.destroy_model_parallel()
+
 
 if __name__ == "__main__":
 
@@ -1104,3 +1161,8 @@ def test_moe_partial_cudagraph(self, ep_size, moe_dropless_dispatcher, moe_dispa
     test.setup_method(method=None)
     test.test_moe_partial_cudagraph(4, True, "alltoall")
     test.teardown_method(method=None)
+
+    test = TestPartialCudaGraph()
+    test.setup_method(method=None)
+    test.test_mhc_moe_partial_cudagraph(4)
+    test.teardown_method(method=None)
diff --git a/tests/unit_tests/transformer/test_hyper_connection_recompute.py b/tests/unit_tests/transformer/test_hyper_connection_recompute.py
new file mode 100644
index 00000000000..cf44f2d7cd0
--- /dev/null
+++ b/tests/unit_tests/transformer/test_hyper_connection_recompute.py
@@ -0,0 +1,408 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""
+Unit tests for HyperConnection block-level recomputation.
+
+Tests the following functionality:
+1. HyperConnectionModule._forward_with_checkpoint correctness
+2. HyperConnectionModule.apply_h_post with CheckpointManager
+3. Multiple HyperConnectionModules chained with a single CheckpointManager
+4. Partial checkpoint (last layer not checkpointed)
+5. TransformerConfig 'mhc' in recompute_modules option
+"""
+
+import pytest
+import torch
+
+from megatron.core.tensor_parallel.random import CheckpointManager, model_parallel_cuda_manual_seed
+from megatron.core.transformer.hyper_connection import HyperConnectionModule
+from megatron.core.transformer.transformer_config import TransformerConfig
+from tests.unit_tests.test_utilities import Utils
+
+
+class TestHyperConnectionCheckpoint:
+    """Test HyperConnectionModule checkpoint functionality."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def _create_hyper_connection_module(self, hidden_size=64, num_residual_streams=4):
+        """Create a HyperConnectionModule for testing."""
+        config = TransformerConfig(
+            num_layers=2,
+            hidden_size=hidden_size,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            enable_hyper_connections=True,
+            num_residual_streams=num_residual_streams,
+            mhc_sinkhorn_iterations=5,  # Fewer iterations for faster tests
+            mhc_init_gating_factor=0.01,
+        )
+        module = HyperConnectionModule(config=config, layer_number=1)
+        module.cuda()
+        return module
+
+    def test_forward_normal_vs_checkpoint_correctness(self):
+        """
+        Test that _forward_with_checkpoint produces the same outputs as _forward_normal.
+        """
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+
+        module = self._create_hyper_connection_module(hidden_size, num_streams)
+
+        # Create input tensors
+        hidden_states = torch.randn(
+            seq_len, batch_size, num_streams * hidden_size, device='cuda', requires_grad=True
+        )
+        residual = torch.randn(
+            seq_len, batch_size, num_streams * hidden_size, device='cuda', requires_grad=True
+        )
+
+        # Clone inputs for comparison
+        hidden_states_ckpt = hidden_states.detach().clone().requires_grad_(True)
+        residual_ckpt = residual.detach().clone().requires_grad_(True)
+
+        # Forward without checkpoint (reference)
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+        aggregated_ref, h_res_ref, h_post_ref = module._forward_normal(hidden_states)
+        mixed_ref = module.apply_h_res(h_res_ref, residual)
+        loss_ref = aggregated_ref.sum() + mixed_ref.sum() + h_post_ref.sum()
+        loss_ref.backward()
+        grad_hidden_ref = hidden_states.grad.clone()
+        grad_residual_ref = residual.grad.clone()
+
+        # Forward with checkpoint
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+        manager = CheckpointManager()
+        aggregated_ckpt, h_res_ckpt, h_post_ckpt = module._forward_with_checkpoint(
+            hidden_states_ckpt, manager
+        )
+        mixed_ckpt = module.apply_h_res(h_res_ckpt, residual_ckpt)
+        # Calculate loss before discarding outputs
+        loss_ckpt = aggregated_ckpt.sum() + mixed_ckpt.sum() + h_post_ckpt.sum()
+
+        # Register unified recompute hook
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+
+        # Backward pass
+        loss_ckpt.backward()
+        grad_hidden_ckpt = hidden_states_ckpt.grad.clone()
+        grad_residual_ckpt = residual_ckpt.grad.clone()
+
+        # Verify gradients match
+        assert torch.allclose(grad_hidden_ckpt, grad_hidden_ref, atol=1e-5), (
+            f"Hidden states gradients mismatch:\n"
+            f"Checkpoint: {grad_hidden_ckpt}\n"
+            f"Reference: {grad_hidden_ref}"
+        )
+        assert torch.allclose(grad_residual_ckpt, grad_residual_ref, atol=1e-5), (
+            f"Residual gradients mismatch:\n"
+            f"Checkpoint: {grad_residual_ckpt}\n"
+            f"Reference: {grad_residual_ref}"
+        )
+
+    def test_apply_h_post_with_checkpoint(self):
+        """
+        Test that apply_h_post with manager produces correct gradients.
+        """
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+
+        module = self._create_hyper_connection_module(hidden_size, num_streams)
+
+        # Create input tensors
+        x = torch.randn(seq_len, batch_size, hidden_size, device='cuda', requires_grad=True)
+        bias = torch.randn(hidden_size, device='cuda')
+        h_post = torch.randn(seq_len, batch_size, num_streams, device='cuda', requires_grad=True)
+
+        # Clone inputs
+        x_ckpt = x.detach().clone().requires_grad_(True)
+        h_post_ckpt = h_post.detach().clone().requires_grad_(True)
+
+        # Reference: without checkpoint (manager=None)
+        torch.manual_seed(42)
+        x_out_ref, bias_out_ref = module.apply_h_post((x, bias), h_post, manager=None)
+        loss_ref = x_out_ref.sum()
+        if bias_out_ref is not None:
+            loss_ref = loss_ref + bias_out_ref.sum()
+        loss_ref.backward()
+        grad_x_ref = x.grad.clone()
+        grad_h_post_ref = h_post.grad.clone()
+
+        # With checkpoint (manager provided)
+        torch.manual_seed(42)
+        manager = CheckpointManager()
+        x_out_ckpt, bias_out_ckpt = module.apply_h_post(
+            (x_ckpt, bias), h_post_ckpt, manager=manager
+        )
+        loss_ckpt = x_out_ckpt.sum()
+        if bias_out_ckpt is not None:
+            loss_ckpt = loss_ckpt + bias_out_ckpt.sum()
+
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+        loss_ckpt.backward()
+        grad_x_ckpt = x_ckpt.grad.clone()
+        grad_h_post_ckpt = h_post_ckpt.grad.clone()
+
+        # Verify gradients
+        assert torch.allclose(grad_x_ckpt, grad_x_ref, atol=1e-5)
+        assert torch.allclose(grad_h_post_ckpt, grad_h_post_ref, atol=1e-5)
+
+    def test_forward_with_manager_parameter(self):
+        """
+        Test forward() method with mhc_recompute_manager parameter.
+        """
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+
+        module = self._create_hyper_connection_module(hidden_size, num_streams)
+
+        # Create input tensors
+        hidden_states = torch.randn(
+            seq_len, batch_size, num_streams * hidden_size, device='cuda', requires_grad=True
+        )
+
+        # Clone inputs
+        hidden_states_ckpt = hidden_states.detach().clone().requires_grad_(True)
+
+        # Reference: forward without manager (uses _forward_normal)
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+        aggregated_ref, h_res_ref, h_post_ref = module.forward(
+            hidden_states, mhc_recompute_manager=None
+        )
+        loss_ref = aggregated_ref.sum() + h_res_ref.sum() + h_post_ref.sum()
+        loss_ref.backward()
+        grad_hidden_ref = hidden_states.grad.clone()
+
+        # With manager (uses _forward_with_checkpoint)
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+        manager = CheckpointManager()
+        aggregated_ckpt, h_res_ckpt, h_post_ckpt = module.forward(
+            hidden_states_ckpt, mhc_recompute_manager=manager
+        )
+        loss_ckpt = aggregated_ckpt.sum() + h_res_ckpt.sum() + h_post_ckpt.sum()
+
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+        loss_ckpt.backward()
+        grad_hidden_ckpt = hidden_states_ckpt.grad.clone()
+
+        # Verify gradients match
+        assert torch.allclose(grad_hidden_ckpt, grad_hidden_ref, atol=1e-5)
+
+
+class TestMHCBlockRecomputeIntegration:
+    """Test CheckpointManager integration with HyperConnection."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_multiple_hyper_connections_in_chain(self):
+        """
+        Test that multiple HyperConnectionModules can be chained together
+        with a single CheckpointManager.
+        """
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+        n_channels = num_streams * hidden_size
+
+        # Create multiple HyperConnection modules (simulating multiple layers)
+        config = TransformerConfig(
+            num_layers=4,
+            hidden_size=hidden_size,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            enable_hyper_connections=True,
+            num_residual_streams=num_streams,
+            mhc_sinkhorn_iterations=5,
+            mhc_init_gating_factor=0.01,
+        )
+
+        modules = [
+            HyperConnectionModule(config=config, layer_number=i + 1).cuda() for i in range(3)
+        ]
+
+        # Create input tensors
+        hidden_states_ref = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        residual_ref = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+
+        hidden_states_ckpt = hidden_states_ref.detach().clone().requires_grad_(True)
+        residual_ckpt = residual_ref.detach().clone().requires_grad_(True)
+
+        # Reference: forward without checkpoint
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+
+        h = hidden_states_ref
+        r = residual_ref
+        for module in modules:
+            agg, h_res, h_post = module.forward(h, mhc_recompute_manager=None)
+            agg, _ = module.apply_h_post((0.1 * agg, None), h_post, manager=None)
+            mixed = module.apply_h_res(h_res, r)  # Apply h_res to get mixed [s, b, n*C]
+            h = agg + mixed
+            r = h
+
+        loss_ref = h.sum()
+        loss_ref.backward()
+        grad_hidden_ref = hidden_states_ref.grad.clone()
+        grad_residual_ref = residual_ref.grad.clone()
+
+        # With checkpoint using single manager
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+
+        manager = CheckpointManager()
+
+        h = hidden_states_ckpt
+        r = residual_ckpt
+        for module in modules:
+            agg, h_res, h_post = module.forward(h, mhc_recompute_manager=manager)
+            agg, _ = module.apply_h_post((0.1 * agg, None), h_post, manager=manager)
+            mixed = module.apply_h_res(h_res, r)  # Apply h_res to get mixed [s, b, n*C]
+            h = agg + mixed
+            r = h
+
+        loss_ckpt = h.sum()
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+        loss_ckpt.backward()
+
+        grad_hidden_ckpt = hidden_states_ckpt.grad.clone()
+        grad_residual_ckpt = residual_ckpt.grad.clone()
+
+        # Verify gradients
+        assert torch.allclose(
+            grad_hidden_ckpt, grad_hidden_ref, atol=1e-4
+        ), f"Chained HyperConnection hidden gradients mismatch"
+        assert torch.allclose(
+            grad_residual_ckpt, grad_residual_ref, atol=1e-4
+        ), f"Chained HyperConnection residual gradients mismatch"
+
+    def test_partial_checkpoint_last_layer_not_checkpointed(self):
+        """
+        Test that when is_last_layer_in_block=True, the final output is NOT checkpointed.
+        This simulates the TransformerBlock behavior where the last layer's MLP BDA
+        serves as the hook_tensor for unified recompute.
+        """
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+
+        config = TransformerConfig(
+            num_layers=2,
+            hidden_size=hidden_size,
+            num_attention_heads=4,
+            use_cpu_initialization=True,
+            enable_hyper_connections=True,
+            num_residual_streams=num_streams,
+            mhc_sinkhorn_iterations=5,
+            mhc_init_gating_factor=0.01,
+        )
+
+        module = HyperConnectionModule(config=config, layer_number=1).cuda()
+
+        hidden_states_ref = torch.randn(
+            seq_len, batch_size, num_streams * hidden_size, device='cuda', requires_grad=True
+        )
+        residual_ref = torch.randn(
+            seq_len, batch_size, num_streams * hidden_size, device='cuda', requires_grad=True
+        )
+
+        hidden_states_ckpt = hidden_states_ref.detach().clone().requires_grad_(True)
+        residual_ckpt = residual_ref.detach().clone().requires_grad_(True)
+
+        # Reference
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+        aggregated_ref, h_res_ref, h_post_ref = module.forward(
+            hidden_states_ref, mhc_recompute_manager=None
+        )
+        aggregated_ref, _ = module.apply_h_post(
+            (0.1 * aggregated_ref, None), h_post_ref, manager=None
+        )
+        mixed_ref = module.apply_h_res(
+            h_res_ref, residual_ref
+        )  # Apply h_res to get mixed [s, b, n*C]
+        # Simulate BDA that is NOT checkpointed (last layer)
+        output_ref = aggregated_ref + 0.5 * mixed_ref
+        loss_ref = output_ref.sum()
+        loss_ref.backward()
+        grad_hidden_ref = hidden_states_ref.grad.clone()
+
+        # With manager - checkpoint everything except final output
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+        manager = CheckpointManager()
+        aggregated_ckpt, h_res_ckpt, h_post_ckpt = module.forward(
+            hidden_states_ckpt, mhc_recompute_manager=manager
+        )
+
+        aggregated_ckpt, _ = module.apply_h_post(
+            (0.1 * aggregated_ckpt, None), h_post_ckpt, manager=manager
+        )
+        mixed_ckpt = module.apply_h_res(
+            h_res_ckpt, residual_ckpt
+        )  # Apply h_res to get mixed [s, b, n*C]
+        # Simulate BDA that is NOT checkpointed (last layer) - this is the hook_tensor
+        output_ckpt = aggregated_ckpt + 0.5 * mixed_ckpt
+
+        # Register unified recompute on the output (which is not checkpointed)
+        manager.discard_all_outputs_and_register_unified_recompute(output_ckpt)
+
+        loss_ckpt = output_ckpt.sum()
+        loss_ckpt.backward()
+        grad_hidden_ckpt = hidden_states_ckpt.grad.clone()
+
+        # Verify gradients match
+        assert torch.allclose(grad_hidden_ckpt, grad_hidden_ref, atol=1e-5)
+
+
+class TestTransformerConfigRecomputeMhc:
+    """Test 'mhc' in recompute_modules configuration."""
+
+    def test_config_default_value(self):
+        """Test that 'mhc' is not in recompute_modules by default."""
+        config = TransformerConfig(num_layers=2, hidden_size=64, num_attention_heads=4)
+        assert "mhc" not in config.recompute_modules
+
+    def test_config_enable_mhc_recompute(self):
+        """Test enabling 'mhc' in recompute_modules."""
+        config = TransformerConfig(
+            num_layers=2,
+            hidden_size=64,
+            num_attention_heads=4,
+            enable_hyper_connections=True,
+            num_residual_streams=4,
+            recompute_modules=["core_attn", "mhc"],
+            recompute_granularity='selective',
+        )
+        assert "mhc" in config.recompute_modules
+        assert config.enable_hyper_connections is True
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
diff --git a/tests/unit_tests/transformer/test_mhc_block_manager.py b/tests/unit_tests/transformer/test_mhc_block_manager.py
new file mode 100644
index 00000000000..aab004d6516
--- /dev/null
+++ b/tests/unit_tests/transformer/test_mhc_block_manager.py
@@ -0,0 +1,397 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+import pytest
+import torch
+
+from megatron.core.tensor_parallel.random import (
+    CheckpointManager,
+    CheckpointWithoutOutput,
+    initialize_rng_tracker,
+)
+from tests.unit_tests.test_utilities import Utils
+
+
+class TestCheckpointWithoutOutputManagerAPI:
+    """Test CheckpointWithoutOutput integration with CheckpointManager."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel()
+        initialize_rng_tracker(force_reset=True)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_auto_register(self):
+        """CheckpointWithoutOutput auto-registers to manager when ckpt_manager is provided."""
+        manager = CheckpointManager()
+
+        def func(x):
+            return x * 2 + 1
+
+        input_t = torch.randn(4, 4, device='cuda', requires_grad=True)
+
+        ckpt = CheckpointWithoutOutput(ckpt_manager=manager)
+        y = ckpt.checkpoint(func, input_t)
+
+        assert len(manager.checkpoints) == 1
+        assert manager.checkpoints[0] is ckpt
+
+        ckpt2 = CheckpointWithoutOutput(ckpt_manager=manager)
+        y2 = ckpt2.checkpoint(torch.nn.functional.gelu, y)
+
+        assert len(manager.checkpoints) == 2
+        assert manager.checkpoints[1] is ckpt2
+
+        loss = y2.sum()
+        manager.discard_all_outputs_and_register_unified_recompute(loss)
+        loss.backward()
+
+        assert input_t.grad is not None
+
+    def test_discard_is_noop_with_manager(self):
+        """discard_output_and_register_recompute is a NO-OP when ckpt_manager is set."""
+        manager = CheckpointManager()
+
+        def func1(x):
+            return x * 2
+
+        def func2(x):
+            return torch.nn.functional.gelu(x)
+
+        input_ref = torch.randn(4, 4, device='cuda', requires_grad=True)
+        y1_ref = func1(input_ref)
+        y2_ref = func2(y1_ref)
+        loss_ref = y2_ref.sum()
+        loss_ref.backward()
+        grad_ref = input_ref.grad.clone()
+
+        input_ckpt = input_ref.detach().clone().requires_grad_(True)
+
+        ckpt1 = CheckpointWithoutOutput(ckpt_manager=manager)
+        y1 = ckpt1.checkpoint(func1, input_ckpt)
+        ckpt1.discard_output_and_register_recompute(y1)
+
+        ckpt2 = CheckpointWithoutOutput(ckpt_manager=manager)
+        y2 = ckpt2.checkpoint(func2, y1)
+        ckpt2.discard_output_and_register_recompute(y2)
+
+        assert y1.untyped_storage().size() > 0, "y1 should NOT be discarded yet"
+        assert y2.untyped_storage().size() > 0, "y2 should NOT be discarded yet"
+
+        loss_ckpt = y2.sum()
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+
+        assert y1.untyped_storage().size() == 0, "y1 should be discarded after manager call"
+        assert y2.untyped_storage().size() == 0, "y2 should be discarded after manager call"
+
+        loss_ckpt.backward()
+        grad_ckpt = input_ckpt.grad.clone()
+
+        assert torch.allclose(grad_ckpt, grad_ref, atol=1e-6)
+
+    def test_backward_compat_without_manager(self):
+        """CheckpointWithoutOutput without ckpt_manager should work exactly as before."""
+
+        def func(x):
+            return torch.nn.functional.gelu(x)
+
+        input_ref = torch.randn(4, 4, device='cuda', requires_grad=True)
+        y_ref = func(input_ref)
+        z_ref = y_ref * 2
+        loss_ref = z_ref.sum()
+        loss_ref.backward()
+        grad_ref = input_ref.grad.clone()
+
+        input_ckpt = input_ref.detach().clone().requires_grad_(True)
+
+        ckpt = CheckpointWithoutOutput()
+        y = ckpt.checkpoint(func, input_ckpt)
+        z = y * 2
+        ckpt.discard_output_and_register_recompute(z)
+
+        assert y.untyped_storage().size() == 0
+
+        loss_ckpt = z.sum()
+        loss_ckpt.backward()
+        grad_ckpt = input_ckpt.grad.clone()
+
+        assert torch.allclose(grad_ckpt, grad_ref, atol=1e-6)
+
+    def test_error_handling(self):
+        """CheckpointManager rejects invalid add_checkpoint calls."""
+        manager = CheckpointManager()
+
+        with pytest.raises(TypeError):
+            manager.add_checkpoint("not a checkpoint")
+
+        ckpt = CheckpointWithoutOutput()
+        with pytest.raises(ValueError):
+            manager.add_checkpoint(ckpt)
+
+
+class TestCheckpointManagerSequentialChain:
+    """Test CheckpointManager with sequential checkpoint chains."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel()
+        initialize_rng_tracker(force_reset=True)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_basic_sequential_chain(self):
+        """Three sequential checkpoints: gradients match non-checkpointed version."""
+
+        def func1(x):
+            return x * 2 + 1
+
+        def func2(x):
+            return torch.nn.functional.gelu(x)
+
+        def func3(x):
+            return x * x + x
+
+        input_ref = torch.randn(4, 4, device='cuda', requires_grad=True)
+        input_ckpt = input_ref.detach().clone().requires_grad_(True)
+
+        y1_ref = func1(input_ref)
+        y2_ref = func2(y1_ref)
+        y3_ref = func3(y2_ref)
+        loss_ref = y3_ref.sum()
+        loss_ref.backward()
+        grad_ref = input_ref.grad.clone()
+
+        manager = CheckpointManager()
+
+        y1 = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(func1, input_ckpt)
+        y2 = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(func2, y1)
+        y3 = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(func3, y2)
+
+        loss_ckpt = y3.sum()
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+
+        assert y1.untyped_storage().size() == 0, "y1 storage should be released"
+        assert y2.untyped_storage().size() == 0, "y2 storage should be released"
+        assert y3.untyped_storage().size() == 0, "y3 storage should be released"
+
+        loss_ckpt.backward()
+        grad_ckpt = input_ckpt.grad.clone()
+
+        assert torch.allclose(
+            grad_ckpt, grad_ref, atol=1e-6
+        ), f"Gradients mismatch!\nWith manager: {grad_ckpt}\nReference: {grad_ref}"
+
+    def test_sequential_chain_with_dropout(self):
+        """RNG state is restored during recompute so dropout gradients match."""
+
+        def func_with_dropout(x):
+            return torch.nn.functional.dropout(x, p=0.3, training=True)
+
+        def func2(x):
+            return torch.nn.functional.gelu(x)
+
+        input_ref = torch.randn(4, 4, device='cuda', requires_grad=True)
+        input_ckpt = input_ref.detach().clone().requires_grad_(True)
+
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+
+        y1_ref = func_with_dropout(input_ref)
+        y2_ref = func2(y1_ref)
+        loss_ref = y2_ref.sum()
+        loss_ref.backward()
+        grad_ref = input_ref.grad.clone()
+
+        torch.manual_seed(42)
+        torch.cuda.manual_seed(42)
+
+        manager = CheckpointManager()
+
+        y1 = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(func_with_dropout, input_ckpt)
+        y2 = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(func2, y1)
+
+        loss_ckpt = y2.sum()
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+
+        loss_ckpt.backward()
+        grad_ckpt = input_ckpt.grad.clone()
+
+        assert torch.allclose(
+            grad_ckpt, grad_ref, atol=1e-6
+        ), f"Gradients with dropout mismatch!\nWith manager: {grad_ckpt}\nReference: {grad_ref}"
+
+    def test_multiple_outputs(self):
+        """CheckpointManager handles functions that return multiple outputs."""
+
+        def func_multi_output(x):
+            return x * 2, x + 1
+
+        def func_combine(a, b):
+            return a + b
+
+        input_ref = torch.randn(4, 4, device='cuda', requires_grad=True)
+        input_ckpt = input_ref.detach().clone().requires_grad_(True)
+
+        y1a_ref, y1b_ref = func_multi_output(input_ref)
+        y2_ref = func_combine(y1a_ref, y1b_ref)
+        loss_ref = y2_ref.sum()
+        loss_ref.backward()
+        grad_ref = input_ref.grad.clone()
+
+        manager = CheckpointManager()
+
+        y1a, y1b = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(
+            func_multi_output, input_ckpt
+        )
+        y2 = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(func_combine, y1a, y1b)
+
+        loss_ckpt = y2.sum()
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+
+        loss_ckpt.backward()
+        grad_ckpt = input_ckpt.grad.clone()
+
+        assert torch.allclose(grad_ckpt, grad_ref, atol=1e-6), (
+            f"Gradients mismatch with multiple outputs!\n"
+            f"With manager: {grad_ckpt}\nReference: {grad_ref}"
+        )
+
+
+class TestCheckpointManagerPartialCheckpoint:
+    """Test CheckpointManager with partial checkpointing (some ops not checkpointed)."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel()
+        initialize_rng_tracker(force_reset=True)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_partial_checkpoint(self):
+        """
+        Only f and h are checkpointed; g is a regular operation.
+
+        Computation chain:
+            a --[f]--> b --[g]--> c --[h]--> d --[sum]--> loss
+        """
+
+        def func_f(x):
+            return torch.nn.functional.gelu(x * 2 + 1)
+
+        def func_g(x):
+            return x * 3 - 2
+
+        def func_h(x):
+            return torch.sigmoid(x) + x
+
+        input_ref = torch.randn(4, 4, device='cuda', requires_grad=True)
+
+        b_ref = func_f(input_ref)
+        c_ref = func_g(b_ref)
+        d_ref = func_h(c_ref)
+        loss_ref = d_ref.sum()
+        loss_ref.backward()
+        grad_ref = input_ref.grad.clone()
+
+        input_ckpt = input_ref.detach().clone().requires_grad_(True)
+
+        manager = CheckpointManager()
+
+        b = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(func_f, input_ckpt)
+        c = func_g(b)
+        d = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(func_h, c)
+
+        loss_ckpt = d.sum()
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+
+        assert b.untyped_storage().size() == 0, "b storage should be released"
+        assert d.untyped_storage().size() == 0, "d storage should be released"
+        assert c.untyped_storage().size() > 0, "c storage should NOT be released (not checkpointed)"
+
+        loss_ckpt.backward()
+        grad_ckpt = input_ckpt.grad.clone()
+
+        assert torch.allclose(grad_ckpt, grad_ref, atol=1e-6), (
+            f"Gradients mismatch with partial checkpoint!\n"
+            f"With manager: {grad_ckpt}\nReference: {grad_ref}"
+        )
+
+    def test_partial_checkpoint_with_tuple_output(self):
+        """
+        Mimics HyperConnection's computation pattern with tuple outputs.
+
+        - compute_mappings: checkpointed, returns tuple (h_pre, h_post, h_res)
+        - aggregate: NOT checkpointed
+        - apply_h_res: checkpointed
+        - apply_h_post: checkpointed
+        """
+
+        def compute_mappings(x):
+            h_pre = torch.sigmoid(x.mean(dim=-1, keepdim=True).expand_as(x))
+            h_post = torch.tanh(x.sum(dim=-1, keepdim=True).expand_as(x))
+            h_res = torch.relu(x)
+            return h_pre, h_post, h_res
+
+        def aggregate(x, h_pre):
+            return x * h_pre
+
+        def apply_h_res(h_res, residual):
+            return h_res + residual * 0.5
+
+        def apply_h_post(y, h_post):
+            return y * h_post + y
+
+        x_ref = torch.randn(4, 4, device='cuda', requires_grad=True)
+        residual_ref = torch.randn(4, 4, device='cuda', requires_grad=True)
+
+        h_pre_ref, h_post_ref, h_res_ref = compute_mappings(x_ref)
+        agg_ref = aggregate(x_ref, h_pre_ref)
+        y_ref = torch.nn.functional.gelu(agg_ref)
+        mixed_ref = apply_h_res(h_res_ref, residual_ref)
+        output_ref = apply_h_post(y_ref, h_post_ref)
+        final_ref = output_ref + mixed_ref
+        loss_ref = final_ref.sum()
+        loss_ref.backward()
+        grad_x_ref = x_ref.grad.clone()
+        grad_residual_ref = residual_ref.grad.clone()
+
+        x_ckpt = x_ref.detach().clone().requires_grad_(True)
+        residual_ckpt = residual_ref.detach().clone().requires_grad_(True)
+
+        manager = CheckpointManager()
+
+        h_pre, h_post, h_res = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(
+            compute_mappings, x_ckpt
+        )
+        agg = aggregate(x_ckpt, h_pre)
+        y = torch.nn.functional.gelu(agg)
+        mixed = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(
+            apply_h_res, h_res, residual_ckpt
+        )
+        output = CheckpointWithoutOutput(ckpt_manager=manager).checkpoint(apply_h_post, y, h_post)
+
+        final = output + mixed
+        loss_ckpt = final.sum()
+
+        manager.discard_all_outputs_and_register_unified_recompute(loss_ckpt)
+
+        assert h_pre.untyped_storage().size() == 0, "h_pre storage should be released"
+        assert h_post.untyped_storage().size() == 0, "h_post storage should be released"
+        assert h_res.untyped_storage().size() == 0, "h_res storage should be released"
+        assert mixed.untyped_storage().size() == 0, "mixed storage should be released"
+        assert output.untyped_storage().size() == 0, "output storage should be released"
+
+        assert agg.untyped_storage().size() > 0, "agg storage should NOT be released"
+        assert y.untyped_storage().size() > 0, "y storage should NOT be released"
+
+        loss_ckpt.backward()
+        grad_x_ckpt = x_ckpt.grad.clone()
+        grad_residual_ckpt = residual_ckpt.grad.clone()
+
+        assert torch.allclose(
+            grad_x_ckpt, grad_x_ref, atol=1e-6
+        ), f"Gradients for x mismatch!\nWith manager: {grad_x_ckpt}\nReference: {grad_x_ref}"
+        assert torch.allclose(grad_residual_ckpt, grad_residual_ref, atol=1e-6), (
+            f"Gradients for residual mismatch!\n"
+            f"With manager: {grad_residual_ckpt}\nReference: {grad_residual_ref}"
+        )
diff --git a/tests/unit_tests/transformer/test_multi_token_prediction.py b/tests/unit_tests/transformer/test_multi_token_prediction.py
index d4d7edfe44b..4d1b08e5a88 100644
--- a/tests/unit_tests/transformer/test_multi_token_prediction.py
+++ b/tests/unit_tests/transformer/test_multi_token_prediction.py
@@ -5,6 +5,7 @@
 
 import pytest
 import torch
+from torch import Tensor
 
 from megatron.core.enums import ModelType
 from megatron.core.extensions.transformer_engine import HAVE_TE
@@ -21,11 +22,13 @@
 from megatron.core.parallel_state import get_context_parallel_group
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.hyper_connection import learned_output_contract
 from megatron.core.transformer.multi_token_prediction import (
     MTPLossLoggingHelper,
     MultiTokenPredictionBlock,
     roll_tensor,
 )
+from megatron.core.transformer.transformer_block import TransformerBlock
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.utils import is_te_min_version
 from megatron.training.arguments import core_transformer_config_from_args, parse_args, validate_args
@@ -105,7 +108,7 @@ def test_constructor_local(self, tp):
             assert num_weights == 15216 * config.mtp_num_layers
 
     @pytest.mark.skipif(not HAVE_TE, reason="transformer_engine not available")
-    @pytest.mark.parametrize(('tp', 'cp'), [(1, 1), (1, 2), (2, 1), (2, 2)])
+    @pytest.mark.parametrize(('tp', 'cp'), [(1, 1), (2, 1), (2, 2)])
     def test_constructor_ues_te(self, tp, cp):
         """Test basic construction of MTP module."""
         torch.manual_seed(_SEED)
@@ -312,7 +315,7 @@ def get_packed_batch(self, seq_lengths, micro_batch_size):
         not HAVE_TE or not is_te_min_version("2.1.0"),
         reason="grouped_gemm requires TransformerEngine >= 2.1.0",
     )
-    @pytest.mark.parametrize(("tp", "cp"), [(1, 1), (1, 2), (2, 1), (2, 2)])
+    @pytest.mark.parametrize(("tp", "cp"), [(2, 1), (2, 2)])
     def test_sharded_state_dict(self, tp, cp):
         """Test MTP with different tensor parallel sizes."""
         args = self.create_test_args(tp, cp, self.seq_length, self.micro_batch_size)
@@ -331,9 +334,8 @@ def test_sharded_state_dict(self, tp, cp):
         not HAVE_TE or not is_te_min_version("2.1.0"),
         reason="grouped_gemm requires TransformerEngine >= 2.1.0",
     )
-    @pytest.mark.parametrize("full_recompute", [False, True])
     @pytest.mark.parametrize(
-        ("tp", "cp"), [(1, 1), (1, 2), (1, 4), (2, 1), (2, 2), (2, 4), (4, 1), (4, 2)]
+        ("tp", "cp", "full_recompute"), [(1, 1, False), (1, 4, False), (2, 4, False), (4, 1, True)]
     )
     def test_forward_backward(self, tmp_path_dist_ckpt, tp, cp, full_recompute):
         """Test MTP forward and backward with gptmodel."""
@@ -927,3 +929,275 @@ def test_attention_mask_validation_mamba(self):
                 pytest.fail(f"Attention mask validation failed for Mamba hybrid model: {e}")
             else:
                 raise
+
+
+class TestLearnedOutputContract:
+    """Tests for learned_output_contract: shape, dtype, gradient, and numerical correctness."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(_SEED)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_shape_and_dtype(self):
+        """Output shape is [*, h] from [*, n*h]; dtype matches input after fp32 round-trip."""
+        seq_len, batch_size, hidden_size, n_streams = 16, 2, 64, 4
+        head_fn = torch.randn(n_streams, n_streams * hidden_size, device='cuda')
+        base = torch.zeros(n_streams, device='cuda')
+        scale = torch.ones(1, device='cuda')
+
+        for dtype in [torch.bfloat16, torch.float16]:
+            hidden_states = torch.randn(
+                seq_len, batch_size, n_streams * hidden_size, device='cuda', dtype=dtype
+            )
+            output = learned_output_contract(
+                hidden_states, head_fn, base, scale, n_streams, eps=1e-6
+            )
+            assert output.shape == (seq_len, batch_size, hidden_size)
+            assert output.dtype == dtype
+
+    def test_gradient_and_numerical_correctness(self):
+        """Gradients flow to all inputs; output matches reference implementation."""
+        torch.manual_seed(_SEED)
+        seq_len, batch_size, hidden_size, n_streams = 2, 1, 8, 2
+        eps = 1e-6
+        hidden_states = torch.randn(
+            seq_len,
+            batch_size,
+            n_streams * hidden_size,
+            device='cuda',
+            dtype=torch.float32,
+            requires_grad=True,
+        )
+        head_fn = torch.randn(n_streams, n_streams * hidden_size, device='cuda', requires_grad=True)
+        base = torch.zeros(n_streams, device='cuda', requires_grad=True)
+        scale = torch.ones(1, device='cuda', requires_grad=True)
+
+        output = learned_output_contract(hidden_states, head_fn, base, scale, n_streams, eps)
+
+        # Numerical reference
+        hs_fp32 = hidden_states.detach().clone()
+        rsqrt_ref = torch.rsqrt(hs_fp32.square().mean(-1, keepdim=True) + eps)
+        mixes_ref = torch.nn.functional.linear(hs_fp32, head_fn.detach()) * rsqrt_ref
+        pre_ref = torch.sigmoid(mixes_ref * scale.detach() + base.detach()) + 1e-6
+        y_ref = torch.sum(
+            pre_ref.unsqueeze(-1) * hs_fp32.view(*hs_fp32.shape[:-1], n_streams, -1), dim=-2
+        )
+        torch.testing.assert_close(output, y_ref, rtol=1e-4, atol=1e-4)
+
+        # Gradient flow
+        output.sum().backward()
+        for name, tensor in [
+            ("hidden_states", hidden_states),
+            ("head_fn", head_fn),
+            ("base", base),
+            ("scale", scale),
+        ]:
+            assert tensor.grad is not None, f"No gradient for {name}"
+            assert not torch.all(tensor.grad == 0), f"Zero gradient for {name}"
+
+
+class TestMHCMTPIntegration:
+    """Integration tests for mHC + MTP: constructor, TransformerBlock output, E2E."""
+
+    def setup_method(self, method):
+        os.environ['CUDA_DEVICE_MAX_CONNECTIONS'] = '1'
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+        MTPLossLoggingHelper.tracker = {}
+
+    @pytest.mark.parametrize('tp', [1, 2])
+    def test_mtp_constructor_with_mhc(self, tp):
+        """MTP layers have e_proj/h_proj (not eh_proj) and learned contraction params."""
+        torch.manual_seed(_SEED)
+        Utils.initialize_model_parallel(tensor_model_parallel_size=tp)
+        config = TransformerConfig(
+            mtp_num_layers=2,
+            num_layers=4,
+            hidden_size=64,
+            num_attention_heads=8,
+            num_residual_streams=4,
+            enable_hyper_connections=True,
+            use_cpu_initialization=True,
+            tensor_model_parallel_size=tp,
+            sequence_parallel=True if tp > 1 else False,
+        )
+        spec = get_gpt_layer_local_spec(enable_hyper_connection=True)
+        mtp_block_spec = get_gpt_mtp_block_spec(
+            config=config, spec=spec, use_transformer_engine=False
+        )
+        mtp = MultiTokenPredictionBlock(config=config, spec=mtp_block_spec)
+
+        n, h = config.num_residual_streams, config.hidden_size
+        for i in range(config.mtp_num_layers):
+            layer = mtp.layers[i]
+            assert layer.e_proj is not None and layer.h_proj is not None
+            assert layer.eh_proj is None
+            assert layer.e_proj.weight.shape == (h // tp, h)
+            assert layer.h_proj.weight.shape == (h // tp, h)
+            assert layer.hc_head_fn.shape == (n, n * h)
+            assert layer.hc_head_base.shape == (n,)
+            assert layer.hc_head_scale.shape == (1,)
+            if tp > 1:
+                assert getattr(layer.hc_head_fn, 'sequence_parallel', False)
+
+    def test_transformer_block_returns_tuple(self):
+        """With mHC+MTP the block returns (contracted, multistream); without MTP just a tensor."""
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(_SEED)
+        spec = get_gpt_layer_local_spec(enable_hyper_connection=True)
+
+        seq_len, batch_size, h, n = 16, 2, 64, 4
+
+        # With MTP: should return tuple
+        config_mtp = TransformerConfig(
+            num_layers=2,
+            hidden_size=h,
+            num_attention_heads=4,
+            enable_hyper_connections=True,
+            num_residual_streams=n,
+            use_cpu_initialization=True,
+            mtp_num_layers=2,
+        )
+        block_mtp = TransformerBlock(config_mtp, spec).cuda()
+        hidden_states = torch.randn(seq_len, batch_size, n * h, device='cuda', requires_grad=True)
+        output = block_mtp(hidden_states=hidden_states, attention_mask=None)
+
+        assert isinstance(output, tuple)
+        contracted, multistream = output
+        assert contracted.shape == (seq_len, batch_size, h)
+        assert multistream.shape == (seq_len, batch_size, n * h)
+
+        (contracted.sum() + multistream.sum()).backward()
+        assert hidden_states.grad is not None
+
+        # Without MTP: should return single tensor
+        config_no_mtp = TransformerConfig(
+            num_layers=2,
+            hidden_size=h,
+            num_attention_heads=4,
+            enable_hyper_connections=True,
+            num_residual_streams=n,
+            use_cpu_initialization=True,
+            mtp_num_layers=None,
+        )
+        block_no_mtp = TransformerBlock(config_no_mtp, spec).cuda()
+        hs2 = torch.randn(seq_len, batch_size, n * h, device='cuda')
+        output2 = block_no_mtp(hidden_states=hs2, attention_mask=None)
+        assert isinstance(output2, Tensor)
+        assert output2.shape == (seq_len, batch_size, h)
+
+    @pytest.mark.skipif(
+        not HAVE_TE or not is_te_min_version("1.7.0"), reason="TransformerEngine >= 1.7.0 required"
+    )
+    @pytest.mark.parametrize('tp', [1, 2])
+    def test_e2e_forward_backward(self, tp):
+        """GPTModel E2E with mHC + MTP: finite output, MTP loss logged, gradients on HC params."""
+        destroy_global_vars()
+        destroy_num_microbatches_calculator()
+
+        seq_length, micro_batch_size = 32, 2
+
+        sys.argv = ['test_multi_token_prediction.py']
+        args = parse_args()
+        args.num_layers = 2
+        args.mtp_num_layers = 2
+        args.mtp_loss_scaling_factor = 0.1
+        args.vocab_size = 128800
+        args.hidden_size = 128
+        args.num_attention_heads = 8
+        args.max_position_embeddings = 256
+        args.micro_batch_size = micro_batch_size
+        args.create_attention_mask_in_dataloader = True
+        args.seq_length = seq_length
+        args.tensor_model_parallel_size = tp
+        args.sequence_parallel = tp > 1
+        args.context_parallel_size = 1
+        args.position_embedding_type = 'rope'
+        args.num_experts = None
+        args.moe_grouped_gemm = False
+        args.train_iters = 1
+        args.lr = 3e-5
+        args.attention_dropout = 0.0
+        args.hidden_dropout = 0.0
+        args.add_bias_linear = False
+        args.swiglu = True
+        args.bf16 = True
+        args.enable_hyper_connections = True
+        args.num_residual_streams = 4
+        args.recompute_granularity = None
+
+        validate_args(args)
+        set_global_variables(args, False)
+        set_args(args)
+        torch.manual_seed(_SEED)
+        Utils.initialize_model_parallel(tensor_model_parallel_size=tp)
+
+        def model_provider(
+            pre_process=True,
+            post_process=True,
+            layer_spec_fn=get_gpt_layer_with_transformer_engine_spec,
+        ):
+            model_parallel_cuda_manual_seed(_SEED)
+            a = get_args()
+            config = core_transformer_config_from_args(a)
+            layer_spec = layer_spec_fn(
+                a.num_experts,
+                a.moe_grouped_gemm,
+                a.qk_layernorm,
+                enable_hyper_connection=config.enable_hyper_connections,
+            )
+            mtp_spec = get_gpt_mtp_block_spec(
+                config=config, spec=layer_spec, use_transformer_engine=True
+            )
+            return GPTModel(
+                config=config,
+                transformer_layer_spec=layer_spec,
+                mtp_block_spec=mtp_spec,
+                vocab_size=a.vocab_size,
+                max_sequence_length=a.max_position_embeddings,
+                pre_process=pre_process,
+                post_process=post_process,
+                fp16_lm_cross_entropy=a.fp16_lm_cross_entropy,
+                parallel_output=True,
+                share_embeddings_and_output_weights=not a.untie_embeddings_and_output_weights,
+                position_embedding_type=a.position_embedding_type,
+                rotary_percent=a.rotary_percent,
+            )
+
+        gpt_model, _, _ = setup_model_and_optimizer(model_provider, ModelType.encoder_or_decoder)
+
+        data = list(range(seq_length))
+        tokens = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        labels = (1 + torch.tensor(data, dtype=torch.int64)).repeat((micro_batch_size, 1)).cuda()
+        position_ids = torch.tensor(data, dtype=torch.int64).repeat((micro_batch_size, 1)).cuda()
+        attention_mask = torch.ones(
+            (micro_batch_size, 1, seq_length, seq_length), dtype=bool
+        ).cuda()
+        loss_mask = torch.ones(seq_length).repeat((micro_batch_size, 1)).cuda()
+
+        output = gpt_model[0].forward(
+            input_ids=tokens,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            labels=labels,
+            loss_mask=loss_mask,
+        )
+        assert torch.isfinite(output).all(), f"Non-finite output (TP={tp})"
+
+        tracker = MTPLossLoggingHelper.tracker
+        assert "values" in tracker, f"MTP loss not logged (TP={tp})"
+        assert torch.isfinite(tracker['values']).all()
+        MTPLossLoggingHelper.clean_loss_in_tracker()
+
+        output.mean().backward()
+        hc_param_names = ['hc_head_fn', 'hc_head_base', 'hc_head_scale']
+        for name, param in gpt_model[0].named_parameters():
+            assert param.main_grad is not None, f"No gradient for {name}"
+            if any(n in name for n in hc_param_names):
+                assert not torch.all(param.main_grad == 0), f"Zero gradient for {name}"
diff --git a/tests/unit_tests/transformer/test_rotary_base_per_layer.py b/tests/unit_tests/transformer/test_rotary_base_per_layer.py
new file mode 100644
index 00000000000..0a655094cb3
--- /dev/null
+++ b/tests/unit_tests/transformer/test_rotary_base_per_layer.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
+"""Tests for per-layer RoPE base (rotary_base_per_layer) wiring in SelfAttention."""
+
+import pytest
+import torch
+
+from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
+from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer import TransformerConfig
+from megatron.core.transformer.attention import SelfAttention
+from tests.unit_tests.test_utilities import Utils
+
+SEQ_LEN = 16
+BATCH_SIZE = 2
+HIDDEN_SIZE = 128
+NUM_HEADS = 4
+NUM_LAYERS = 2
+ROTARY_BASE_L1 = 10000.0
+ROTARY_BASE_L2 = 5000.0
+
+
+def _make_config(rotary_base_per_layer=None) -> TransformerConfig:
+    config = TransformerConfig(
+        num_layers=NUM_LAYERS,
+        hidden_size=HIDDEN_SIZE,
+        num_attention_heads=NUM_HEADS,
+        use_cpu_initialization=True,
+        bf16=True,
+        params_dtype=torch.bfloat16,
+        rotary_base_per_layer=rotary_base_per_layer,
+    )
+    # _build_per_layer_rotary_pos_emb reads these attributes from config; they are
+    # normally injected by GPTModel but must be set manually in unit tests.
+    config.position_embedding_type = 'rope'
+    config.rotary_scaling_factor = None  # seq_len_interpolation_factor
+    config.rotary_percent = 1.0
+    config.rope_scaling = False
+    config.rope_scaling_factor = 8.0
+    return config
+
+
+def _make_attention(config: TransformerConfig, layer_number: int = 1) -> SelfAttention:
+    submodules = get_gpt_layer_local_spec().submodules.self_attention.submodules
+    return SelfAttention(config, submodules, layer_number=layer_number)
+
+
+class TestRotaryBasePerLayerInit:
+    """Verify that SelfAttention builds the correct per-layer RotaryEmbedding."""
+
+    @pytest.fixture(autouse=True)
+    def setup_teardown(self):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(42)
+        yield
+        Utils.destroy_model_parallel()
+
+    def test_rotary_pos_emb_is_rope_instance(self):
+        """rotary_pos_emb is a RotaryEmbedding when rotary_base_per_layer is set."""
+        config = _make_config([ROTARY_BASE_L1, ROTARY_BASE_L2])
+        attn = _make_attention(config, layer_number=1)
+        assert isinstance(attn.rotary_pos_emb, RotaryEmbedding)
+
+    def test_rotary_pos_emb_none_without_per_layer_config(self):
+        """rotary_pos_emb stays None when rotary_base_per_layer is not set."""
+        config = TransformerConfig(
+            num_layers=NUM_LAYERS,
+            hidden_size=HIDDEN_SIZE,
+            num_attention_heads=NUM_HEADS,
+            use_cpu_initialization=True,
+            bf16=True,
+            params_dtype=torch.bfloat16,
+        )
+        attn = _make_attention(config, layer_number=1)
+        assert attn.rotary_pos_emb is None
+
+    def test_different_bases_produce_different_inv_freq(self):
+        """Layers with distinct bases must have different inv_freq tensors."""
+        config = _make_config([ROTARY_BASE_L1, ROTARY_BASE_L2])
+        attn1 = _make_attention(config, layer_number=1)
+        attn2 = _make_attention(config, layer_number=2)
+        assert not torch.allclose(attn1.rotary_pos_emb.inv_freq, attn2.rotary_pos_emb.inv_freq)
+
+    def test_same_base_produces_identical_inv_freq(self):
+        """Layers sharing the same base must have identical inv_freq tensors."""
+        config = _make_config([ROTARY_BASE_L1, ROTARY_BASE_L1])
+        attn1 = _make_attention(config, layer_number=1)
+        attn2 = _make_attention(config, layer_number=2)
+        torch.testing.assert_close(attn1.rotary_pos_emb.inv_freq, attn2.rotary_pos_emb.inv_freq)
diff --git a/tests/unit_tests/transformer/test_te_fused_dense_mlp_spec.py b/tests/unit_tests/transformer/test_te_fused_dense_mlp_spec.py
new file mode 100644
index 00000000000..d83cca760be
--- /dev/null
+++ b/tests/unit_tests/transformer/test_te_fused_dense_mlp_spec.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+
+import pytest
+import torch.nn.functional as F
+
+from megatron.core.extensions.transformer_engine import (
+    HAVE_TE,
+    TEFusedDenseMLP,
+    TELayerNormColumnParallelLinear,
+    TERowParallelLinear,
+)
+from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.transformer.mlp import MLPSubmodules
+from megatron.core.transformer.transformer_config import TransformerConfig
+from megatron.core.utils import is_te_min_version
+from tests.unit_tests.test_utilities import Utils
+
+_SKIP_REASON = "TEFusedDenseMLP requires Transformer Engine >= 2.14.0"
+_SKIP = not HAVE_TE or not is_te_min_version("2.14.0")
+
+
+def _make_submodules():
+    return MLPSubmodules(linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear)
+
+
+def _make_config(**overrides):
+    defaults = dict(
+        num_layers=1,
+        hidden_size=64,
+        num_attention_heads=4,
+        activation_func=F.silu,
+        gated_linear_unit=True,
+        add_bias_linear=False,
+        use_cpu_initialization=True,
+    )
+    defaults.update(overrides)
+    return TransformerConfig(**defaults)
+
+
+@pytest.mark.skipif(_SKIP, reason=_SKIP_REASON)
+class TestTEFusedDenseMLPSpec:
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def test_instantiation(self):
+        config = _make_config()
+        mlp = TEFusedDenseMLP(config, _make_submodules())
+        assert isinstance(mlp, TEFusedDenseMLP)
+
+    def test_wrong_activation_raises(self):
+        config = _make_config(activation_func=F.gelu, gated_linear_unit=False)
+        with pytest.raises(ValueError, match="SwiGLU activation"):
+            TEFusedDenseMLP(config, _make_submodules())
+
+    def test_gated_linear_unit_false_raises(self):
+        config = _make_config(gated_linear_unit=False)
+        with pytest.raises(ValueError, match="SwiGLU activation"):
+            TEFusedDenseMLP(config, _make_submodules())
+
+    def test_add_bias_linear_raises(self):
+        config = _make_config(add_bias_linear=True)
+        with pytest.raises(ValueError, match="add_bias_linear"):
+            TEFusedDenseMLP(config, _make_submodules())
+
+    def test_norm_seq_not_registered_as_submodule(self):
+        # _norm_seq must be stored in a tuple (not directly as nn.Module) to avoid
+        # PyTorch registering it as a submodule, which would duplicate norm weights
+        # in state_dict/parameters. Verify it starts as None and is never a bare Module.
+        import torch.nn as nn
+
+        config = _make_config()
+        mlp = TEFusedDenseMLP(config, _make_submodules())
+        assert mlp._norm_seq is None
+        assert '_norm_seq' not in dict(mlp.named_children())
+
+        # Simulate what _make_fused_impl does and confirm the tuple-wrap holds.
+        import transformer_engine.pytorch.ops as te_ops
+
+        fake_seq = te_ops.Sequential()
+        mlp._norm_seq = (fake_seq,)
+        assert not isinstance(mlp._norm_seq, nn.Module)
+        assert '_norm_seq' not in dict(mlp.named_children())
diff --git a/tests/unit_tests/transformer/test_thd_correctness.py b/tests/unit_tests/transformer/test_thd_correctness.py
index ccf70b8a885..533f64081f4 100644
--- a/tests/unit_tests/transformer/test_thd_correctness.py
+++ b/tests/unit_tests/transformer/test_thd_correctness.py
@@ -30,6 +30,7 @@
 import torch.nn as nn
 
 from megatron.core import parallel_state
+from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
 from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
@@ -149,15 +150,29 @@ def compute_sbhd_padded_max_len(
 
 
 def compute_thd_padded_seqlens(
-    seqlens: List[int], cp_size: int, tp_size: int, sp_enabled: bool, pad_to_max: bool = False
+    seqlens: List[int],
+    cp_size: int,
+    tp_size: int,
+    sp_enabled: bool,
+    pad_to_max: bool = False,
+    dynamic_cp: bool = False,
 ) -> List[int]:
     """Padded per-sequence lengths for THD.
 
     When pad_to_max=True, each sequence is padded to max(seqlens) so that
     total THD tokens = max_len * B, matching SBHD. This ensures TE GEMM
     kernels see identical M dimensions for bitwise comparison.
+
+    When dynamic_cp=True, pad to the global upper-bound CP size so that the
+    same packed layout works regardless of which dynamic CP sub-group the
+    sequence lands in.
     """
-    cp_divisor = 2 * cp_size if cp_size > 1 else 1
+    if dynamic_cp:
+        MAX_CP_SIZE = 8
+        effective_cp = max(cp_size, MAX_CP_SIZE)
+    else:
+        effective_cp = cp_size
+    cp_divisor = 2 * effective_cp if cp_size > 1 else 1
     if pad_to_max:
         max_len = _round_up(max(seqlens), cp_divisor)
         padded = [max_len] * len(seqlens)
@@ -181,6 +196,7 @@ def make_packed_seq_params(
     tp_size: int = 1,
     sp_enabled: bool = False,
     pad_to_max: bool = False,
+    dynamic_cp: bool = False,
 ) -> PackedSeqParams:
     """Create PackedSeqParams with cu_seqlens and cu_seqlens_padded."""
 
@@ -190,7 +206,9 @@ def to_cu_seqlens(lens):
             cu[i + 1] = cu[i] + l
         return cu.cuda()
 
-    padded = compute_thd_padded_seqlens(seqlens, cp_size, tp_size, sp_enabled, pad_to_max)
+    padded = compute_thd_padded_seqlens(
+        seqlens, cp_size, tp_size, sp_enabled, pad_to_max, dynamic_cp=dynamic_cp
+    )
     return PackedSeqParams(
         cu_seqlens_q=to_cu_seqlens(seqlens),
         cu_seqlens_kv=to_cu_seqlens(seqlens),
@@ -362,10 +380,21 @@ def shard_sbhd(tensor, cp_rank, cp_size, tp_rank, tp_size, sp_enabled):
 
 
 def shard_thd(
-    seq_data_list, seqlens, cp_rank, cp_size, tp_rank, tp_size, sp_enabled, H, pad_to_max=False
+    seq_data_list,
+    seqlens,
+    cp_rank,
+    cp_size,
+    tp_rank,
+    tp_size,
+    sp_enabled,
+    H,
+    pad_to_max=False,
+    dynamic_cp=False,
 ):
     """Shard per-sequence data into local THD [local_T, 1, H]."""
-    padded = compute_thd_padded_seqlens(seqlens, cp_size, tp_size, sp_enabled, pad_to_max)
+    padded = compute_thd_padded_seqlens(
+        seqlens, cp_size, tp_size, sp_enabled, pad_to_max, dynamic_cp=dynamic_cp
+    )
 
     chunks = []
     for data, sl, psl in zip(seq_data_list, seqlens, padded):
@@ -446,7 +475,7 @@ class _GatherTHD(torch.autograd.Function):
     """Gather THD outputs from all ranks with gradient support."""
 
     @staticmethod
-    def forward(ctx, local, seqlens, cp_size, tp_size, sp_enabled, H, pad_to_max):
+    def forward(ctx, local, seqlens, cp_size, tp_size, sp_enabled, H, pad_to_max, dynamic_cp):
         ctx.seqlens, ctx.cp_size, ctx.tp_size, ctx.sp_enabled, ctx.H = (
             seqlens,
             cp_size,
@@ -456,7 +485,9 @@ def forward(ctx, local, seqlens, cp_size, tp_size, sp_enabled, H, pad_to_max):
         )
         ctx.cp_rank = parallel_state.get_context_parallel_rank() if cp_size > 1 else 0
         ctx.tp_rank = parallel_state.get_tensor_model_parallel_rank()
-        ctx.padded = compute_thd_padded_seqlens(seqlens, cp_size, tp_size, sp_enabled, pad_to_max)
+        ctx.padded = compute_thd_padded_seqlens(
+            seqlens, cp_size, tp_size, sp_enabled, pad_to_max, dynamic_cp=dynamic_cp
+        )
 
         out = local
         if sp_enabled:
@@ -495,7 +526,7 @@ def backward(ctx, grad):
         if ctx.sp_enabled:
             seg = packed.shape[0] // ctx.tp_size
             packed = packed[ctx.tp_rank * seg : (ctx.tp_rank + 1) * seg]
-        return packed.unsqueeze(1).contiguous(), None, None, None, None, None, None
+        return packed.unsqueeze(1).contiguous(), None, None, None, None, None, None, None
 
 
 def gather_sbhd(local, cp_size, tp_size, sp_enabled):
@@ -504,8 +535,8 @@ def gather_sbhd(local, cp_size, tp_size, sp_enabled):
     return _GatherSBHD.apply(local, cp_size, tp_size, sp_enabled)
 
 
-def gather_thd(local, seqlens, cp_size, tp_size, sp_enabled, H, pad_to_max=False):
-    return _GatherTHD.apply(local, seqlens, cp_size, tp_size, sp_enabled, H, pad_to_max)
+def gather_thd(local, seqlens, cp_size, tp_size, sp_enabled, H, pad_to_max=False, dynamic_cp=False):
+    return _GatherTHD.apply(local, seqlens, cp_size, tp_size, sp_enabled, H, pad_to_max, dynamic_cp)
 
 
 # =============================================================================
@@ -647,3 +678,386 @@ def test_thd_format(tc: TestCase):
     if tc.forward_bitwise or tc.backward_bitwise:
         torch.use_deterministic_algorithms(False)
         os.environ.pop("NVTE_ALLOW_NONDETERMINISTIC_ALGO", None)
+
+
+# =============================================================================
+# Dynamic CP Test Infrastructure
+# =============================================================================
+
+
+@dataclass
+class DynamicCPAssignment:
+    """Per-rank assignment in the dynamic CP configuration.
+
+    local_cp_size: number of ranks in this rank's CP communicator.
+    seq_indices: indices into the test case's seqlens list that this rank processes.
+
+    Ranks sharing the same CP sub-group have identical DynamicCPAssignment values.
+    """
+
+    local_cp_size: int
+    seq_indices: List[int]
+
+
+@dataclass
+class DynamicCPTestCase:
+    """Test case for dynamic CP correctness.
+
+    Compares fixed CP (baseline) against dynamic CP where sub-groups of ranks
+    can process different sequences with different CP sizes.
+
+    dcp_assignments: one entry per DP×CP rank (len == dp_cp_world_size).
+    Ranks in the same sub-group share the same local_cp_size and seq_indices.
+    """
+
+    name: str
+    hidden_size: int
+    num_heads: int
+    num_kv_heads: int
+    ffn_hidden_size: int
+    seqlens: List[int]
+    tp_size: int
+    cp_size: int
+    sp_enabled: bool
+    dcp_assignments: List[DynamicCPAssignment]
+
+
+# Dynamic CP Test Cases
+# ---------------------
+# Each test runs two paths through the *same* TransformerLayer and compares
+# forward outputs + backward gradients (similarity check with TE attention).
+#
+# Parameters:
+#   cp_size — the CP size used for the *baseline* (fixed CP) path.  It also
+#   determines dp_size = world_size // (tp_size * cp_size), which controls how
+#   sequences are split across DP ranks in the baseline.  The dynamic CP path
+#   ignores this cp_size and instead uses the local_cp_size from each
+#   DynamicCPAssignment.
+#
+# Baseline (fixed CP):
+#   Sequences are evenly split across DP ranks (seqs_per_dp = len(seqlens) //
+#   dp_size).  Each DP rank runs standard CP (cp_size) on its subset:
+#   pad → zigzag shard → forward → gather → backward.
+#
+# Dynamic CP:
+#   dcp_assignments has one entry per DP×CP rank.  Ranks sharing a CP sub-group
+#   have identical (local_cp_size, seq_indices).  Each sub-group forms its own
+#   CP communicator and independently shards / gathers only the sequences
+#   assigned to it.
+#
+# Sequence lengths are intentionally non-powers-of-two (mostly primes) so
+# that padding to cp_divisor is always exercised.
+#
+# fmt: off
+_A = DynamicCPAssignment
+DYNAMIC_CP_TEST_CASES = [
+    # -------------------------------------------------------------------------
+    # Uniform: all dp_cp ranks share all seqs with larger local_cp_size.
+    # All 4 ranks form one sub-group → equivalent to fixed CP but via the
+    # dynamic CP code path.
+    # -------------------------------------------------------------------------
+    # tp=2, cp=2, world_size=8 → dp_cp_size=4, all ranks get same assignment
+    DynamicCPTestCase(
+        "dcp_uniform_tp2_cp2_sp",
+        4096, 32, 8, 14336,
+        [3947, 1999, 1037, 4091, 2111, 503],
+        tp_size=2, cp_size=2, sp_enabled=True,
+        dcp_assignments=[
+            _A(4, [0, 1, 2, 3, 4, 5]),  # dp_cp_rank 0
+            _A(4, [0, 1, 2, 3, 4, 5]),  # dp_cp_rank 1
+            _A(4, [0, 1, 2, 3, 4, 5]),  # dp_cp_rank 2
+            _A(4, [0, 1, 2, 3, 4, 5]),  # dp_cp_rank 3
+        ],
+    ),
+    # tp=1, cp=2, world_size=8 → dp_cp_size=8, all ranks get same assignment
+    DynamicCPTestCase(
+        "dcp_uniform_tp1_cp2",
+        1024, 16, 4, 4096,
+        [4001, 2039, 997, 511, 3967, 2053, 1009, 499],
+        tp_size=1, cp_size=2, sp_enabled=False,
+        dcp_assignments=[
+            _A(8, [0, 1, 2, 3, 4, 5, 6, 7]),  # dp_cp_rank 0
+            _A(8, [0, 1, 2, 3, 4, 5, 6, 7]),  # dp_cp_rank 1
+            _A(8, [0, 1, 2, 3, 4, 5, 6, 7]),  # dp_cp_rank 2
+            _A(8, [0, 1, 2, 3, 4, 5, 6, 7]),  # dp_cp_rank 3
+            _A(8, [0, 1, 2, 3, 4, 5, 6, 7]),  # dp_cp_rank 4
+            _A(8, [0, 1, 2, 3, 4, 5, 6, 7]),  # dp_cp_rank 5
+            _A(8, [0, 1, 2, 3, 4, 5, 6, 7]),  # dp_cp_rank 6
+            _A(8, [0, 1, 2, 3, 4, 5, 6, 7]),  # dp_cp_rank 7
+        ],
+    ),
+    # -------------------------------------------------------------------------
+    # Heterogeneous: sub-groups with different local_cp_size.
+    # Ranks are split into multiple CP sub-groups; some ranks process
+    # sequences alone (local_cp_size=1) while others cooperate (local_cp_size=2+).
+    # -------------------------------------------------------------------------
+    # tp=2, cp=4, world_size=8 → dp_cp_size=4
+    #   rank 0: alone (cp=1), rank 1: alone (cp=1), ranks 2-3: pair (cp=2)
+    DynamicCPTestCase(
+        "dcp_hetero_tp2_cp4_sp",
+        4096, 32, 8, 14336,
+        [4093, 2017, 3989, 2111, 1013, 509],
+        tp_size=2, cp_size=4, sp_enabled=True,
+        dcp_assignments=[
+            _A(1, [0]),              # dp_cp_rank 0: solo
+            _A(1, [1]),              # dp_cp_rank 1: solo
+            _A(2, [2, 3, 4, 5]),     # dp_cp_rank 2: pair with rank 3
+            _A(2, [2, 3, 4, 5]),     # dp_cp_rank 3: pair with rank 2
+        ],
+    ),
+    # tp=1, cp=4, world_size=8 → dp_cp_size=8
+    #   ranks 0,1: solo; ranks 2-3: pair; ranks 4,5: solo; ranks 6-7: pair
+    DynamicCPTestCase(
+        "dcp_hetero_tp1_cp4",
+        1024, 16, 4, 4096,
+        [4007, 2003, 3989, 2053, 4091, 2017, 1013, 503],
+        tp_size=1, cp_size=4, sp_enabled=False,
+        dcp_assignments=[
+            _A(1, [0]),          # dp_cp_rank 0: solo
+            _A(1, [1]),          # dp_cp_rank 1: solo
+            _A(2, [2, 3]),       # dp_cp_rank 2: pair with rank 3
+            _A(2, [2, 3]),       # dp_cp_rank 3: pair with rank 2
+            _A(1, [4]),          # dp_cp_rank 4: solo
+            _A(1, [5]),          # dp_cp_rank 5: solo
+            _A(2, [6, 7]),       # dp_cp_rank 6: pair with rank 7
+            _A(2, [6, 7]),       # dp_cp_rank 7: pair with rank 6
+        ],
+    ),
+    # -------------------------------------------------------------------------
+    # Mixed: cp4 + cp2 + cp1 + cp1, baseline fixed cp=2.
+    # tp=1, cp=2, world_size=8 → dp_cp_size=8, dp_size=4
+    #   ranks 0-3: quad (cp=4), ranks 4-5: pair (cp=2), rank 6: solo, rank 7: solo
+    # -------------------------------------------------------------------------
+    DynamicCPTestCase(
+        "dcp_mixed_tp1_cp2",
+        1024, 16, 4, 4096,
+        [4007, 2003, 3989, 2053, 4091, 2017, 1013, 503],
+        tp_size=1, cp_size=2, sp_enabled=False,
+        dcp_assignments=[
+            _A(4, [0, 1, 2, 3]),     # dp_cp_rank 0: quad with ranks 1,2,3
+            _A(4, [0, 1, 2, 3]),     # dp_cp_rank 1: quad with ranks 0,2,3
+            _A(4, [0, 1, 2, 3]),     # dp_cp_rank 2: quad with ranks 0,1,3
+            _A(4, [0, 1, 2, 3]),     # dp_cp_rank 3: quad with ranks 0,1,2
+            _A(2, [4, 5]),           # dp_cp_rank 4: pair with rank 5
+            _A(2, [4, 5]),           # dp_cp_rank 5: pair with rank 4
+            _A(1, [6]),              # dp_cp_rank 6: solo
+            _A(1, [7]),              # dp_cp_rank 7: solo
+        ],
+    ),
+]
+# fmt: on
+
+
+# =============================================================================
+# Dynamic CP Gather (with explicit cp_group)
+# =============================================================================
+
+
+class _GatherTHDDynamic(torch.autograd.Function):
+    """Gather THD outputs from an explicit CP group with gradient support."""
+
+    @staticmethod
+    def forward(ctx, local, seqlens, cp_size, tp_size, sp_enabled, H, cp_group, cp_rank):
+        ctx.seqlens, ctx.cp_size, ctx.tp_size, ctx.sp_enabled, ctx.H = (
+            seqlens,
+            cp_size,
+            tp_size,
+            sp_enabled,
+            H,
+        )
+        ctx.cp_rank = cp_rank
+        ctx.tp_rank = parallel_state.get_tensor_model_parallel_rank()
+        ctx.padded = compute_thd_padded_seqlens(
+            seqlens, cp_size, tp_size, sp_enabled, False, dynamic_cp=True
+        )
+
+        out = local
+        if sp_enabled:
+            gathered = [torch.empty_like(out) for _ in range(tp_size)]
+            dist.all_gather(
+                gathered, out.contiguous(), group=parallel_state.get_tensor_model_parallel_group()
+            )
+            out = torch.cat(gathered, dim=0)
+
+        if cp_size > 1:
+            local_lens = [p // cp_size for p in ctx.padded]
+            offset, seqs = 0, []
+            for i, ll in enumerate(local_lens):
+                chunk = out[offset : offset + ll]
+                gathered = [torch.empty_like(chunk) for _ in range(cp_size)]
+                dist.all_gather(gathered, chunk.contiguous(), group=cp_group)
+                seqs.append(_zigzag_merge(gathered, cp_size)[: seqlens[i]])
+                offset += ll
+            out = torch.cat(seqs, dim=0)
+        else:
+            out = _strip_thd_padding(out, seqlens, ctx.padded)
+        return out
+
+    @staticmethod
+    def backward(ctx, grad):
+        offset, chunks = 0, []
+        for sl, psl in zip(ctx.seqlens, ctx.padded):
+            g = grad[offset : offset + sl, 0, :]
+            if psl > sl:
+                g = torch.cat([g, torch.zeros(psl - sl, ctx.H, dtype=g.dtype, device=g.device)])
+            chunks.append(_zigzag_split(g, ctx.cp_rank, ctx.cp_size))
+            offset += sl
+
+        packed = torch.cat(chunks, dim=0)
+        if ctx.sp_enabled:
+            seg = packed.shape[0] // ctx.tp_size
+            packed = packed[ctx.tp_rank * seg : (ctx.tp_rank + 1) * seg]
+        return packed.unsqueeze(1).contiguous(), None, None, None, None, None, None, None
+
+
+def gather_thd_dynamic(local, seqlens, cp_size, tp_size, sp_enabled, H, cp_group, cp_rank):
+    return _GatherTHDDynamic.apply(
+        local, seqlens, cp_size, tp_size, sp_enabled, H, cp_group, cp_rank
+    )
+
+
+# =============================================================================
+# Dynamic CP Test Function
+# =============================================================================
+
+
+@pytest.mark.parametrize("tc", DYNAMIC_CP_TEST_CASES, ids=lambda tc: tc.name)
+def test_dynamic_cp_format(tc: DynamicCPTestCase):
+    """Compare fixed CP THD vs dynamic CP THD format outputs and gradients."""
+    H, seqlens = tc.hidden_size, tc.seqlens
+    tp_size, cp_size, sp = tc.tp_size, tc.cp_size, tc.sp_enabled
+
+    Utils.initialize_model_parallel(
+        tensor_model_parallel_size=tp_size,
+        context_parallel_size=cp_size,
+        dynamic_context_parallel=True,
+    )
+    model_parallel_cuda_manual_seed(42)
+
+    layer = build_gpt_layer(
+        H,
+        tc.num_heads,
+        tc.num_kv_heads,
+        tc.ffn_hidden_size,
+        tp_size,
+        cp_size,
+        sp,
+        use_mock_attention=False,
+        deterministic=False,
+    )
+    kv_channels = H // tc.num_heads
+    rope = RotaryEmbedding(kv_channels=kv_channels, rotary_percent=1.0).cuda()
+
+    cp_rank = parallel_state.get_context_parallel_rank()
+    tp_rank = parallel_state.get_tensor_model_parallel_rank()
+    dp_rank = parallel_state.get_data_parallel_rank()
+    dp_size = parallel_state.get_data_parallel_world_size()
+
+    # All ranks generate identical full data (same seed, no dp_rank offset)
+    torch.manual_seed(42)
+    all_seq_data = [torch.randn(sl, H, dtype=torch.bfloat16).cuda() for sl in seqlens]
+    torch.manual_seed(142)
+    all_grad_data = [torch.randn(sl, H, dtype=torch.bfloat16).cuda() for sl in seqlens]
+
+    # === Baseline: fixed CP, THD format ===
+    assert (
+        len(seqlens) % dp_size == 0
+    ), f"Need len(seqlens)={len(seqlens)} divisible by dp_size={dp_size}"
+    seqs_per_dp = len(seqlens) // dp_size
+    base_indices = list(range(dp_rank * seqs_per_dp, (dp_rank + 1) * seqs_per_dp))
+    base_seqlens = [seqlens[i] for i in base_indices]
+    base_seq_data = [all_seq_data[i] for i in base_indices]
+    base_grad_data = [all_grad_data[i] for i in base_indices]
+
+    local_thd_base = shard_thd(
+        base_seq_data, base_seqlens, cp_rank, cp_size, tp_rank, tp_size, sp, H, dynamic_cp=True
+    )
+    packed_base = make_packed_seq_params(base_seqlens, cp_size, tp_size, sp, dynamic_cp=True)
+    rotary_pos_emb_base = rope(packed_base.max_seqlen_q, packed_seq=True)
+    input_base = local_thd_base.detach().clone().requires_grad_(True)
+    out_base, _ = layer(
+        hidden_states=input_base, packed_seq_params=packed_base, rotary_pos_emb=rotary_pos_emb_base
+    )
+    gathered_base = gather_thd(out_base, base_seqlens, cp_size, tp_size, sp, H, dynamic_cp=True)
+    grad_base = torch.cat(base_grad_data, dim=0).unsqueeze(1)
+    gathered_base.backward(grad_base)
+    baseline_grads = {n: p.grad.clone() for n, p in layer.named_parameters()}
+    layer.zero_grad()
+
+    # === Dynamic CP ===
+    dp_cp_group = parallel_state.get_data_parallel_group(with_context_parallel=True)
+    dp_cp_rank = dist.get_rank(group=dp_cp_group)
+
+    assert dp_cp_rank < len(
+        tc.dcp_assignments
+    ), f"dp_cp_rank={dp_cp_rank} out of range (len={len(tc.dcp_assignments)})"
+    my_assignment = tc.dcp_assignments[dp_cp_rank]
+    local_cp_size = my_assignment.local_cp_size
+    dcp_indices = my_assignment.seq_indices
+    dcp_seqlens = [seqlens[i] for i in dcp_indices]
+    dcp_seq_data = [all_seq_data[i] for i in dcp_indices]
+    dcp_grad_data = [all_grad_data[i] for i in dcp_indices]
+
+    dcp_cp_group = parallel_state.get_dynamic_data_context_parallel_groups(group_size=local_cp_size)
+    dcp_cp_rank = dist.get_rank(group=dcp_cp_group)
+
+    local_thd_dcp = shard_thd(
+        dcp_seq_data,
+        dcp_seqlens,
+        dcp_cp_rank,
+        local_cp_size,
+        tp_rank,
+        tp_size,
+        sp,
+        H,
+        dynamic_cp=True,
+    )
+    packed_dcp = make_packed_seq_params(dcp_seqlens, local_cp_size, tp_size, sp, dynamic_cp=True)
+    packed_dcp.local_cp_size = local_cp_size
+    packed_dcp.cp_group = dcp_cp_group
+    rotary_pos_emb_dcp = rope(packed_dcp.max_seqlen_q, packed_seq=True)
+
+    input_dcp = local_thd_dcp.detach().clone().requires_grad_(True)
+    out_dcp, _ = layer(
+        hidden_states=input_dcp, packed_seq_params=packed_dcp, rotary_pos_emb=rotary_pos_emb_dcp
+    )
+    gathered_dcp = gather_thd_dynamic(
+        out_dcp, dcp_seqlens, local_cp_size, tp_size, sp, H, dcp_cp_group, dcp_cp_rank
+    )
+    grad_dcp = torch.cat(dcp_grad_data, dim=0).unsqueeze(1)
+    gathered_dcp.backward(grad_dcp)
+    dcp_grads = {n: p.grad.clone() for n, p in layer.named_parameters()}
+
+    # === Gradient sync: reduce across all DP×CP ranks ===
+    for n in baseline_grads:
+        dist.all_reduce(baseline_grads[n], group=dp_cp_group)
+        dist.all_reduce(dcp_grads[n], group=dp_cp_group)
+    if sp:
+        tp_group = parallel_state.get_tensor_model_parallel_group()
+        for n, p in layer.named_parameters():
+            if getattr(p, "sequence_parallel", False):
+                dist.all_reduce(baseline_grads[n], group=tp_group)
+                dist.all_reduce(dcp_grads[n], group=tp_group)
+
+    # === Forward comparison (per-sequence, on ranks that have both) ===
+    common_indices = sorted(set(base_indices) & set(dcp_indices))
+    for seq_idx in common_indices:
+        sl = seqlens[seq_idx]
+        base_pos = base_indices.index(seq_idx)
+        base_offset = sum(base_seqlens[:base_pos])
+        dcp_pos = dcp_indices.index(seq_idx)
+        dcp_offset = sum(dcp_seqlens[:dcp_pos])
+        assert_close(
+            f"seq[{seq_idx}] output",
+            gathered_base[base_offset : base_offset + sl, 0].detach(),
+            gathered_dcp[dcp_offset : dcp_offset + sl, 0].detach(),
+            False,
+        )
+
+    # === Backward comparison ===
+    for n in baseline_grads:
+        if n in dcp_grads:
+            assert_close(f"grad[{n}]", baseline_grads[n], dcp_grads[n], False)
+
+    # === Cleanup ===
+    Utils.destroy_model_parallel()
diff --git a/tests/unit_tests/transformer/test_transformer_layer.py b/tests/unit_tests/transformer/test_transformer_layer.py
index da1f9ce5860..c80b8f14480 100644
--- a/tests/unit_tests/transformer/test_transformer_layer.py
+++ b/tests/unit_tests/transformer/test_transformer_layer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 
 
 import pytest
@@ -8,17 +8,41 @@
 from megatron.core.dist_checkpointing.mapping import ShardedObject, ShardedTensor
 from megatron.core.inference.contexts import StaticInferenceContext
 from megatron.core.models.gpt.gpt_layer_specs import (
+    get_gpt_layer_with_transformer_engine_spec,
     get_gpt_layer_with_transformer_engine_submodules,
 )
-from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
+from megatron.core.tensor_parallel.random import CheckpointManager, model_parallel_cuda_manual_seed
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import (
+    HyperConnectionTransformerLayer,
     TransformerLayer,
     get_transformer_layer_offset,
 )
 from tests.unit_tests.test_utilities import Utils
 
 
+def _make_mhc_config(hidden_size=64, num_streams=4, **extra):
+    """Build a TransformerConfig with common MHC defaults.
+
+    Any default can be overridden via **extra
+    (e.g. ``_make_mhc_config(num_layers=8, recompute_modules=["core_attn", "mhc"])``).
+    """
+    base = dict(
+        num_layers=2,
+        hidden_size=hidden_size,
+        num_attention_heads=4,
+        use_cpu_initialization=True,
+        enable_hyper_connections=True,
+        num_residual_streams=num_streams,
+        mhc_sinkhorn_iterations=5,
+        mhc_init_gating_factor=0.01,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+    )
+    base.update(extra)
+    return TransformerConfig(**base)
+
+
 class TestParallelTransformerLayer:
 
     def setup_method(self, method):
@@ -313,3 +337,765 @@ def get_tensor_shapes_for_tp(transformer_config, tp_size):
         'self_attention.linear_qkv.weight': (hs * 3 // tp_size, hs),
         'self_attention.linear_qkv.bias': (hs * 3 // tp_size,),
     }
+
+
+class TestTransformerLayerWithHyperConnectionRecompute:
+    """Test TransformerLayer with HyperConnection and MHC block recomputation."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def _create_layer_with_hyper_connection(
+        self, hidden_size=64, num_streams=4, layer_number=1, **extra
+    ):
+        """Create a HyperConnectionTransformerLayer with hyper connection enabled."""
+        config = _make_mhc_config(
+            hidden_size=hidden_size,
+            num_streams=num_streams,
+            recompute_modules=["core_attn", "mhc"],
+            recompute_granularity='selective',
+            **extra,
+        )
+        layer_spec = get_gpt_layer_with_transformer_engine_spec(enable_hyper_connection=True)
+        layer = HyperConnectionTransformerLayer(
+            config, layer_spec.submodules, layer_number=layer_number
+        )
+        layer.cuda()
+        return layer, config
+
+    def test_forward_with_hyper_connection_recompute(self):
+        """
+        Test that TransformerLayer forward works correctly with HyperConnection
+        and MHC block recomputation enabled.
+        """
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+
+        layer, config = self._create_layer_with_hyper_connection(hidden_size, num_streams)
+        layer.train()  # Enable training mode for recomputation
+
+        # Input shape: [seq_len, batch_size, n * hidden_size] for hyper connections
+        n_channels = num_streams * hidden_size
+        hidden_states = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        # Create manager for MHC block recomputation
+        manager = CheckpointManager()
+
+        # Forward pass with recompute manager
+        manager.is_last_layer_in_recompute_block = True
+        output, context = layer(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            mhc_recompute_manager=manager,
+        )
+
+        # Verify output shape
+        assert output.shape == (
+            seq_len,
+            batch_size,
+            n_channels,
+        ), f"Expected output shape {(seq_len, batch_size, n_channels)}, got {output.shape}"
+
+        # Register unified recompute hook at block boundary.
+        manager.discard_all_outputs_and_register_unified_recompute(output)
+
+        # Backward pass should work without error
+        loss = output.sum()
+        loss.backward()
+
+        # Verify gradients exist
+        assert hidden_states.grad is not None, "Gradients should be computed for hidden_states"
+        assert hidden_states.grad.shape == hidden_states.shape
+
+    def test_intermediate_layer_with_recompute(self):
+        """
+        Test TransformerLayer as an intermediate layer (not last in block).
+        In this case, MLP BDA should also be checkpointed.
+        """
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+
+        layer, config = self._create_layer_with_hyper_connection(hidden_size, num_streams)
+        layer.train()
+
+        n_channels = num_streams * hidden_size
+        hidden_states = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        manager = CheckpointManager()
+
+        # Forward pass - NOT the last layer in block
+        manager.is_last_layer_in_recompute_block = False
+        output, context = layer(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            mhc_recompute_manager=manager,
+        )
+
+        # Verify output shape
+        assert output.shape == (seq_len, batch_size, n_channels)
+
+        # Backward pass should work
+        loss = output.sum()
+        # For intermediate layers, we need to pass output to next layer
+        # Here we just register the recompute hook on output for testing
+        manager.discard_all_outputs_and_register_unified_recompute(loss)
+
+        loss.backward()
+
+        assert hidden_states.grad is not None
+        assert hidden_states.grad.shape == hidden_states.shape
+
+    def test_multiple_layers_chain_with_recompute(self):
+        """
+        Test multiple TransformerLayers chained together with a single
+        CheckpointManager, simulating TransformerBlock behavior.
+        """
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+        num_layers = 3
+
+        layers = [
+            self._create_layer_with_hyper_connection(
+                hidden_size, num_streams, layer_number=i + 1, num_layers=num_layers
+            )[0]
+            for i in range(num_layers)
+        ]
+
+        for layer in layers:
+            layer.train()
+
+        n_channels = num_streams * hidden_size
+        hidden_states = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        # Single manager for all layers (like TransformerBlock)
+        manager = CheckpointManager()
+
+        # Forward through all layers
+        h = hidden_states
+        for i, layer in enumerate(layers):
+            is_last = i == num_layers - 1
+            manager.is_last_layer_in_recompute_block = is_last
+            h, _ = layer(
+                hidden_states=h, attention_mask=attention_mask, mhc_recompute_manager=manager
+            )
+            if is_last:
+                manager.discard_all_outputs_and_register_unified_recompute(h)
+
+        # Backward pass
+        loss = h.sum()
+        loss.backward()
+
+        # Verify gradients
+        assert hidden_states.grad is not None
+        assert hidden_states.grad.shape == hidden_states.shape
+        # Check that gradient is non-trivial (not all zeros)
+        assert hidden_states.grad.abs().sum() > 0
+
+
+class TestMHCRecomputeMemorySaving:
+    """Verify that 'mhc' in recompute_modules actually reduces peak GPU memory."""
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    @staticmethod
+    def _run_forward_backward(
+        num_layers,
+        hidden_size,
+        num_streams,
+        seq_len,
+        batch_size,
+        use_recompute,
+        recompute_block_size=2,
+    ):
+        """Run a full forward + backward pass and return (peak memory, output grad).
+
+        When use_recompute=True, a new CheckpointManager is created every
+        `recompute_block_size` layers, mirroring TransformerBlock's
+        _build_mhc_recompute_layer_plan logic.
+        """
+        config = _make_mhc_config(
+            hidden_size=hidden_size,
+            num_streams=num_streams,
+            num_layers=num_layers,
+            recompute_modules=["core_attn", "mhc"] if use_recompute else None,
+            recompute_granularity='selective' if use_recompute else None,
+        )
+        layer_spec = get_gpt_layer_with_transformer_engine_spec(enable_hyper_connection=True)
+        layers = [
+            HyperConnectionTransformerLayer(
+                config, layer_spec.submodules, layer_number=i + 1
+            ).cuda()
+            for i in range(num_layers)
+        ]
+        for layer in layers:
+            layer.train()
+
+        n_channels = num_streams * hidden_size
+        hidden_states = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        torch.cuda.reset_peak_memory_stats()
+        torch.cuda.synchronize()
+
+        manager = CheckpointManager() if use_recompute else None
+
+        h = hidden_states
+        for i, layer in enumerate(layers):
+            is_last_in_block = (i == num_layers - 1) or ((i + 1) % recompute_block_size == 0)
+            kwargs = dict(hidden_states=h, attention_mask=attention_mask)
+            if manager is not None:
+                manager.is_last_layer_in_recompute_block = is_last_in_block
+                kwargs['mhc_recompute_manager'] = manager
+            h, _ = layer(**kwargs)
+            if manager is not None and is_last_in_block:
+                manager.discard_all_outputs_and_register_unified_recompute(h)
+                if i < num_layers - 1:
+                    manager = CheckpointManager()
+
+        loss = h.sum()
+        loss.backward()
+        torch.cuda.synchronize()
+
+        peak_mem = torch.cuda.max_memory_allocated()
+        grad = hidden_states.grad.clone()
+
+        del layers, hidden_states, h, loss, manager
+        torch.cuda.empty_cache()
+
+        return peak_mem, grad
+
+    def test_recompute_reduces_peak_memory(self):
+        """Peak memory with recompute (block_size=2) should be lower than without."""
+        num_layers = 8
+        hidden_size = 128
+        num_streams = 4
+        seq_len = 64
+        batch_size = 4
+
+        peak_no_recompute, _ = self._run_forward_backward(
+            num_layers, hidden_size, num_streams, seq_len, batch_size, use_recompute=False
+        )
+        peak_recompute, _ = self._run_forward_backward(
+            num_layers,
+            hidden_size,
+            num_streams,
+            seq_len,
+            batch_size,
+            use_recompute=True,
+            recompute_block_size=2,
+        )
+
+        saving_pct = (peak_no_recompute - peak_recompute) / peak_no_recompute * 100
+
+        assert peak_recompute < peak_no_recompute, (
+            f"Recompute should reduce peak memory, but got "
+            f"no_recompute={peak_no_recompute / 1e6:.1f}MB vs "
+            f"recompute={peak_recompute / 1e6:.1f}MB "
+            f"(saving={saving_pct:.1f}%)"
+        )
+
+
+class TestMHCWithCudaGraph:
+    """Test HyperConnectionTransformerLayer compatibility with CUDA graphs.
+
+    CUDA graph capture requires static computation graphs and fixed tensor shapes.
+    These tests verify that the mHC layer properly supports the CUDA graph interface
+    defined in GraphableMegatronModule and TransformerLayer.
+    """
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123, use_cudagraphable_rng=True, force_reset_rng=True)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def _create_mhc_layer(self, hidden_size=64, num_streams=4, **extra_config):
+        config = _make_mhc_config(hidden_size=hidden_size, num_streams=num_streams, **extra_config)
+        layer_spec = get_gpt_layer_with_transformer_engine_spec(enable_hyper_connection=True)
+        layer = HyperConnectionTransformerLayer(config, layer_spec.submodules)
+        layer.cuda()
+        return layer, config
+
+    def test_get_layer_static_inputs_shape_for_mhc(self):
+        """get_layer_static_inputs must return [s, b, n*C] for mHC layers.
+
+        CUDA graph capture creates static buffers whose shapes are determined by
+        this method. If the shape is [s, b, C] instead of [s, b, n*C], the graph
+        capture will produce a shape mismatch at the first hyper connection module.
+        """
+        layer, config = self._create_mhc_layer()
+        seq_length = 32
+        micro_batch_size = 2
+
+        static_inputs = layer.get_layer_static_inputs(seq_length, micro_batch_size)
+        hidden_states = static_inputs["hidden_states"]
+
+        expected_hidden_dim = config.num_residual_streams * config.hidden_size
+        assert hidden_states.shape[-1] == expected_hidden_dim, (
+            f"get_layer_static_inputs returns hidden dim {hidden_states.shape[-1]} "
+            f"but mHC expects {expected_hidden_dim} (n={config.num_residual_streams} * "
+            f"C={config.hidden_size}). "
+            f"HyperConnectionTransformerLayer must override get_layer_static_inputs."
+        )
+
+    def test_submodules_under_cudagraphs_includes_hyper_connection(self):
+        """_get_submodules_under_cudagraphs must include hyper connection modules.
+
+        CUDA graph manual hooks are set up for parameters of submodules returned
+        by this method. Missing hyper connection modules means their parameters
+        (mapping_proj, alpha_*, bias) will not get proper pre-forward hooks during
+        graph replay, leading to stale parameter values.
+        """
+        layer, config = self._create_mhc_layer()
+
+        submodules = layer._get_submodules_under_cudagraphs()
+
+        hc_modules_found = any(
+            hasattr(m, 'mapping_proj') for submod in submodules for m in submod.modules()
+        )
+        assert hc_modules_found, (
+            "_get_submodules_under_cudagraphs does not include HyperConnectionModule. "
+            "Parameters like mapping_proj, alpha_pre/post/res will not be updated "
+            "during CUDA graph replay."
+        )
+
+    def test_forward_through_te_cuda_graph_capture_path(self):
+        """_te_cuda_graph_capture must produce correct output shapes for mHC.
+
+        TE CUDA graph capture calls _te_cuda_graph_capture() during warmup.
+        For mHC layers, the input must be n-stream [s, b, n*C] and output must
+        also be [s, b, n*C].
+        """
+        layer, config = self._create_mhc_layer()
+        layer.eval()
+
+        seq_len = 8
+        batch_size = 2
+        n_channels = config.num_residual_streams * config.hidden_size
+
+        hidden_states = torch.randn(seq_len, batch_size, n_channels, device='cuda')
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        with torch.no_grad():
+            outputs = layer._te_cuda_graph_capture(
+                hidden_states=hidden_states, attention_mask=attention_mask
+            )
+
+        if isinstance(outputs, tuple):
+            output = outputs[0]
+        else:
+            output = outputs
+
+        assert output.shape == (seq_len, batch_size, n_channels), (
+            f"_te_cuda_graph_capture output shape {output.shape} != "
+            f"expected {(seq_len, batch_size, n_channels)}"
+        )
+
+    def test_cuda_graph_fwd_bwd_with_hyper_connection(self):
+        """End-to-end CUDA graph capture and replay for forward+backward with mHC.
+
+        Captures both the forward and backward pass of HyperConnectionTransformerLayer
+        into a torch.cuda.CUDAGraph and replays it with fresh input data, verifying
+        that the computation graph is fully static (capturable) and produces correct
+        output shapes and non-trivial gradients.
+        """
+        layer, config = self._create_mhc_layer()
+        layer.train()
+
+        seq_len = 8
+        batch_size = 2
+        n_channels = config.num_residual_streams * config.hidden_size
+
+        static_input = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        # Warmup on side stream to trigger lazy allocations
+        s = torch.cuda.Stream()
+        s.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(s):
+            for _ in range(3):
+                out, _ = layer(hidden_states=static_input, attention_mask=attention_mask)
+                out.sum().backward()
+        torch.cuda.current_stream().wait_stream(s)
+
+        # Set .grad to None so backward allocates fresh gradient tensors in the
+        # graph's private memory pool during capture.
+        layer.zero_grad(set_to_none=True)
+        static_input.grad = None
+
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            output, _ = layer(hidden_states=static_input, attention_mask=attention_mask)
+            output.sum().backward()
+
+        # Replay with new input data.
+        # Use no_grad because backward inside the captured graph already
+        # bumped the autograd version counter on static_input, making
+        # in-place copy_ illegal without disabling grad tracking.
+        with torch.no_grad():
+            static_input.copy_(torch.randn_like(static_input))
+        g.replay()
+
+        assert output.shape == (
+            seq_len,
+            batch_size,
+            n_channels,
+        ), f"Output shape {output.shape} != expected {(seq_len, batch_size, n_channels)}"
+        assert (
+            static_input.grad is not None
+        ), "Gradients should be computed for static_input after graph replay"
+        assert static_input.grad.shape == static_input.shape
+        assert static_input.grad.abs().sum() > 0, "Gradients should be non-trivial"
+
+        # Verify numerical consistency: graph replay should match eager execution
+        # with the same input and weights.
+        test_data = torch.randn(seq_len, batch_size, n_channels, device='cuda')
+
+        with torch.no_grad():
+            static_input.copy_(test_data)
+        g.replay()
+        graph_out = output.detach().clone()
+        graph_grad = static_input.grad.detach().clone()
+
+        eager_input = test_data.clone().requires_grad_(True)
+        eager_output, _ = layer(hidden_states=eager_input, attention_mask=attention_mask)
+        eager_output.sum().backward()
+
+        assert torch.allclose(graph_out, eager_output.detach(), atol=1e-5), (
+            f"Graph vs eager output mismatch: "
+            f"max diff = {(graph_out - eager_output.detach()).abs().max().item()}"
+        )
+        assert torch.allclose(graph_grad, eager_input.grad, atol=1e-5), (
+            f"Graph vs eager gradient mismatch: "
+            f"max diff = {(graph_grad - eager_input.grad).abs().max().item()}"
+        )
+
+    def test_cuda_graph_fwd_bwd_with_hyper_connection_and_recompute(self):
+        """CUDA graph capture+replay for fwd+bwd with mHC and CheckpointManager.
+
+        When a CheckpointManager is used, additional CheckpointWithoutOutput
+        objects are created for layernorm and hyper-connection operations. The
+        manager discards intermediate activations during forward (storage.resize_(0))
+        and recomputes them during backward via a unified gradient hook.
+        This test verifies the full capture+replay still works correctly.
+        """
+        layer, config = self._create_mhc_layer()
+        layer.train()
+
+        seq_len = 8
+        batch_size = 2
+        n_channels = config.num_residual_streams * config.hidden_size
+
+        static_input = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        # Warmup on side stream; fresh manager per iteration to avoid stale state.
+        s = torch.cuda.Stream()
+        s.wait_stream(torch.cuda.current_stream())
+        with torch.cuda.stream(s):
+            for _ in range(3):
+                mgr = CheckpointManager()
+                mgr.is_last_layer_in_recompute_block = True
+                out, _ = layer(
+                    hidden_states=static_input,
+                    attention_mask=attention_mask,
+                    mhc_recompute_manager=mgr,
+                )
+                mgr.discard_all_outputs_and_register_unified_recompute(out)
+                out.sum().backward()
+        torch.cuda.current_stream().wait_stream(s)
+
+        layer.zero_grad(set_to_none=True)
+        static_input.grad = None
+
+        capture_mgr = CheckpointManager()
+        capture_mgr.is_last_layer_in_recompute_block = True
+
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            output, _ = layer(
+                hidden_states=static_input,
+                attention_mask=attention_mask,
+                mhc_recompute_manager=capture_mgr,
+            )
+            capture_mgr.discard_all_outputs_and_register_unified_recompute(output)
+            output.sum().backward()
+
+        # Replay with new input data.
+        with torch.no_grad():
+            static_input.copy_(torch.randn_like(static_input))
+        g.replay()
+
+        assert output.shape == (
+            seq_len,
+            batch_size,
+            n_channels,
+        ), f"Output shape {output.shape} != expected {(seq_len, batch_size, n_channels)}"
+        assert (
+            static_input.grad is not None
+        ), "Gradients should be computed for static_input after graph replay"
+        assert static_input.grad.shape == static_input.shape
+        assert static_input.grad.abs().sum() > 0, "Gradients should be non-trivial"
+
+        # Numerical consistency: graph replay vs eager with the same input.
+        test_data = torch.randn(seq_len, batch_size, n_channels, device='cuda')
+
+        with torch.no_grad():
+            static_input.copy_(test_data)
+        g.replay()
+        graph_out = output.detach().clone()
+        graph_grad = static_input.grad.detach().clone()
+
+        eager_mgr = CheckpointManager()
+        eager_mgr.is_last_layer_in_recompute_block = True
+        eager_input = test_data.clone().requires_grad_(True)
+        eager_output, _ = layer(
+            hidden_states=eager_input,
+            attention_mask=attention_mask,
+            mhc_recompute_manager=eager_mgr,
+        )
+        eager_mgr.discard_all_outputs_and_register_unified_recompute(eager_output)
+        eager_output.sum().backward()
+
+        assert torch.allclose(graph_out, eager_output.detach(), atol=1e-5), (
+            f"Graph vs eager output mismatch: "
+            f"max diff = {(graph_out - eager_output.detach()).abs().max().item()}"
+        )
+        assert torch.allclose(graph_grad, eager_input.grad, atol=1e-5), (
+            f"Graph vs eager gradient mismatch: "
+            f"max diff = {(graph_grad - eager_input.grad).abs().max().item()}"
+        )
+
+    def test_mcore_cudagraph_manager_with_mhc_recompute_manager(self):
+        """MCore CudaGraphManager must not crash on mhc_recompute_manager kwarg.
+
+        When cuda_graph_impl="local" is set, TransformerLayer.__call__ routes
+        through MegatronModule.__call__ → CudaGraphManager.__call__, which
+        iterates over all kwargs to check supported types. CheckpointManager
+        (used by mhc_recompute_manager) is not a CUDA-graph-supported type.
+
+        This test verifies that mhc_recompute_manager is properly extracted
+        from kwargs before the CudaGraphManager sees them, preventing the
+        AssertionError that would otherwise occur.
+        """
+        layer, config = self._create_mhc_layer(cuda_graph_impl="local", cuda_graph_scope="attn")
+        layer.train()
+
+        assert hasattr(
+            layer, 'cudagraph_manager'
+        ), "Layer should have cudagraph_manager with cuda_graph_impl='local'"
+
+        seq_len = 8
+        batch_size = 2
+        n_channels = config.num_residual_streams * config.hidden_size
+
+        hidden_states = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        mgr = CheckpointManager()
+        mgr.is_last_layer_in_recompute_block = True
+
+        output, context = layer(
+            hidden_states=hidden_states, attention_mask=attention_mask, mhc_recompute_manager=mgr
+        )
+
+        assert output.shape == (seq_len, batch_size, n_channels)
+
+    def test_mcore_cudagraph_manager_without_mhc_recompute_manager(self):
+        """MCore CudaGraphManager path works when mhc_recompute_manager is None."""
+        layer, config = self._create_mhc_layer(cuda_graph_impl="local", cuda_graph_scope="attn")
+        layer.train()
+
+        seq_len = 8
+        batch_size = 2
+        n_channels = config.num_residual_streams * config.hidden_size
+
+        hidden_states = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        output, context = layer(hidden_states=hidden_states, attention_mask=attention_mask)
+
+        assert output.shape == (seq_len, batch_size, n_channels)
+
+
+class TestMHCWithOffloading:
+    """Test HyperConnectionTransformerLayer with fine-grained activation offloading.
+
+    Fine-grained activation offloading transfers specific activations (e.g., layernorm
+    inputs) to CPU during forward and reloads them during backward. These tests verify
+    that the mHC layer's multi-stream architecture works correctly with offloading.
+    """
+
+    def setup_method(self, method):
+        Utils.initialize_model_parallel(1, 1)
+        model_parallel_cuda_manual_seed(123)
+
+    def teardown_method(self, method):
+        Utils.destroy_model_parallel()
+
+    def _create_mhc_layer_with_offloading(
+        self, hidden_size=64, num_streams=4, offload_modules=None
+    ):
+        if offload_modules is None:
+            offload_modules = ["attn_norm", "mlp_norm"]
+
+        config = _make_mhc_config(
+            hidden_size=hidden_size,
+            num_streams=num_streams,
+            fine_grained_activation_offloading=True,
+            offload_modules=offload_modules,
+        )
+        layer_spec = get_gpt_layer_with_transformer_engine_spec(enable_hyper_connection=True)
+        layer = HyperConnectionTransformerLayer(config, layer_spec.submodules)
+        layer.cuda()
+        return layer, config
+
+    def test_forward_backward_with_offloading(self):
+        """Forward+backward should work with activation offloading enabled.
+
+        This exercises the off_interface context manager around layernorms in
+        the mHC forward path, including the group_commit that commits the
+        offloading group for the aggregated 1-stream layernorm input.
+        """
+        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+            PipelineOffloadManager,
+        )
+
+        layer, config = self._create_mhc_layer_with_offloading()
+        layer.train()
+
+        seq_len = 8
+        batch_size = 2
+        n_channels = config.num_residual_streams * config.hidden_size
+
+        hidden_states = torch.randn(
+            seq_len, batch_size, n_channels, device='cuda', requires_grad=True
+        )
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        mgr = PipelineOffloadManager.get_instance()
+        mgr.init_model_chunk_offload_handler(
+            pp_rank=0, vp_size=1, vp_stage=0, min_offloaded_tensor_size=0
+        )
+
+        output, context = layer(hidden_states=hidden_states, attention_mask=attention_mask)
+
+        assert output.shape == (
+            seq_len,
+            batch_size,
+            n_channels,
+        ), f"Output shape {output.shape} != expected {(seq_len, batch_size, n_channels)}"
+
+        loss = output.sum()
+        loss.backward()
+
+        assert hidden_states.grad is not None, "Gradients should flow through offloaded path"
+        assert hidden_states.grad.shape == hidden_states.shape
+        assert hidden_states.grad.abs().sum() > 0, "Gradients should be non-trivial"
+
+        PipelineOffloadManager.reset_instance()
+
+    def test_offloading_numerical_equivalence(self):
+        """Offloaded forward+backward must produce the same result as non-offloaded.
+
+        Compares outputs and gradients between a layer with offloading disabled
+        vs enabled to ensure the offloading path does not corrupt activations.
+        """
+        from megatron.core.pipeline_parallel.fine_grained_activation_offload import (
+            PipelineOffloadManager,
+        )
+
+        PipelineOffloadManager.reset_instance()
+
+        hidden_size = 64
+        num_streams = 4
+        seq_len = 8
+        batch_size = 2
+        n_channels = num_streams * hidden_size
+
+        torch.manual_seed(42)
+        input_data = torch.randn(seq_len, batch_size, n_channels, device='cuda')
+        attention_mask = torch.ones((1, 1, seq_len, seq_len), dtype=bool, device='cuda')
+
+        # Run without offloading
+        config_no_offload = _make_mhc_config(hidden_size=hidden_size, num_streams=num_streams)
+        layer_spec = get_gpt_layer_with_transformer_engine_spec(enable_hyper_connection=True)
+        layer_no_offload = HyperConnectionTransformerLayer(
+            config_no_offload, layer_spec.submodules
+        ).cuda()
+        layer_no_offload.train()
+
+        h1 = input_data.clone().detach().requires_grad_(True)
+        out1, _ = layer_no_offload(hidden_states=h1, attention_mask=attention_mask)
+        out1.sum().backward()
+        grad_no_offload = h1.grad.clone()
+        out1_detached = out1.detach().clone()
+
+        # Run with offloading using the same weights
+        config_offload = _make_mhc_config(
+            hidden_size=hidden_size,
+            num_streams=num_streams,
+            fine_grained_activation_offloading=True,
+            offload_modules=["attn_norm", "mlp_norm"],
+        )
+        layer_offload = HyperConnectionTransformerLayer(
+            config_offload, layer_spec.submodules
+        ).cuda()
+        layer_offload.load_state_dict(layer_no_offload.state_dict())
+        layer_offload.train()
+
+        mgr = PipelineOffloadManager.get_instance()
+        mgr.init_model_chunk_offload_handler(
+            pp_rank=0, vp_size=1, vp_stage=0, min_offloaded_tensor_size=0
+        )
+
+        h2 = input_data.clone().detach().requires_grad_(True)
+        out2, _ = layer_offload(hidden_states=h2, attention_mask=attention_mask)
+        out2.sum().backward()
+        grad_offload = h2.grad.clone()
+
+        PipelineOffloadManager.reset_instance()
+
+        assert torch.allclose(out1_detached, out2.detach(), atol=1e-5), (
+            f"Forward outputs differ: max diff = "
+            f"{(out1_detached - out2.detach()).abs().max().item()}"
+        )
+        assert torch.allclose(grad_no_offload, grad_offload, atol=1e-5), (
+            f"Gradients differ: max diff = "
+            f"{(grad_no_offload - grad_offload).abs().max().item()}"
+        )
diff --git a/tests/unit_tests/transformer/test_vision_cuda_graphs.py b/tests/unit_tests/transformer/test_vision_cuda_graphs.py
index bfd431e67a3..a7c6cf8de65 100644
--- a/tests/unit_tests/transformer/test_vision_cuda_graphs.py
+++ b/tests/unit_tests/transformer/test_vision_cuda_graphs.py
@@ -403,6 +403,7 @@ def test_create_cudagraphs_multi_microbatch(self):
 
         helper.delete_cuda_graphs()
 
+    @pytest.mark.flaky_in_dev
     def test_create_cudagraphs_no_callables_is_noop(self):
         """create_cudagraphs on empty helper should not crash."""
         dummy_model = torch.nn.Linear(4, 4)
@@ -610,6 +611,8 @@ def test_pp2_create_cudagraphs_first_stage(self):
         for layer in helper.callables:
             assert layer.cuda_graphs == []
 
+    @pytest.mark.flaky
+    @pytest.mark.flaky_in_dev
     @pytest.mark.skipif(
         not (HAVE_TE_GRAPHS and is_te_min_version("2.7.0")),
         reason="TE CUDA graph capture requires TransformerEngine >= 2.7.0",
diff --git a/tools/bert_embedding/embed.py b/tools/bert_embedding/embed.py
index 84b6a55480e..4504349635d 100644
--- a/tools/bert_embedding/embed.py
+++ b/tools/bert_embedding/embed.py
@@ -1,24 +1,25 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 
+import os
+import time
 from functools import partial
 from types import SimpleNamespace
 from typing import Callable, Dict, List, Optional, Tuple, TypedDict
+
 import numpy as np
-import os
-import time
 import torch
 from torch.distributed import ProcessGroup
 from torch.utils.data import BatchSampler, DataLoader, SequentialSampler, Subset
 from torch.utils.data._utils.collate import default_collate
 
-from megatron.training import get_args, get_tokenizer, print_rank_0
 from megatron import core
-from megatron.training.arguments import core_transformer_config_from_args
 from megatron.core import parallel_state
 from megatron.core.enums import ModelType
 from megatron.core.pipeline_parallel import get_forward_backward_func
+from megatron.training import get_args, get_tokenizer, print_rank_0
+from megatron.training.arguments import core_transformer_config_from_args
 from megatron.training.training import setup_model_and_optimizer
-from pretrain_bert import model_provider, get_batch, loss_func, forward_step
+from pretrain_bert import forward_step, get_batch, loss_func, model_provider
 
 from .dataset import BertEmbeddingDataset
 from .external_libs import h5py
@@ -51,29 +52,29 @@ def collate_batch(samples):
     tokenizer = get_tokenizer()
 
     # Max sample length across all samples.
-    max_length_map = { key:0 for key in keys }
+    max_length_map = {key: 0 for key in keys}
     for sample in samples:
         for key in keys:
-            value_length = \
-                len(sample[key]) if isinstance(sample[key], np.ndarray) else None
-            max_length_map[key] = None \
-                if value_length is None else \
-                   max(max_length_map[key], value_length)
+            value_length = len(sample[key]) if isinstance(sample[key], np.ndarray) else None
+            max_length_map[key] = (
+                None if value_length is None else max(max_length_map[key], value_length)
+            )
 
     # Pad samples.
     padded_samples = []
     for sample in samples:
         padded_sample = {}
         for key in keys:
-            padded_sample[key] = \
+            padded_sample[key] = (
                 np.pad(
                     sample[key],
                     (0, max_length_map[key] - len(sample[key])),
                     mode="constant",
                     constant_values=tokenizer.pad_id if key == "text" else 0,
-                ) \
-                if isinstance(sample[key], np.ndarray) else \
-                   sample[key]
+                )
+                if isinstance(sample[key], np.ndarray)
+                else sample[key]
+            )
         padded_samples.append(padded_sample)
 
     # Build batch with padded samples.
@@ -93,17 +94,17 @@ def get_data_loader(dataset, batch_size):
 
     # Sequential & batch samplers.
     batch_sampler = BatchSampler(
-        sampler=SequentialSampler(dataset),
-        batch_size=batch_size,
-        drop_last=False,
+        sampler=SequentialSampler(dataset), batch_size=batch_size, drop_last=False
     )
 
     # Data loader.
-    data_loader = DataLoader(dataset,
-                             batch_sampler=batch_sampler,
-                             num_workers=args.num_workers,
-                             pin_memory=True,
-                             collate_fn=collate_batch)
+    data_loader = DataLoader(
+        dataset,
+        batch_sampler=batch_sampler,
+        num_workers=args.num_workers,
+        pin_memory=True,
+        collate_fn=collate_batch,
+    )
 
     return data_loader
 
@@ -113,9 +114,9 @@ def embed_data_loader(models, data_loader, tag):
 
     # Verify no model parallelism.
     args = get_args()
-    assert args.tensor_model_parallel_size == 1 and \
-        args.pipeline_model_parallel_size == 1, \
-        "since we call forward_step directly, only tp == pp == 1 allowed."
+    assert (
+        args.tensor_model_parallel_size == 1 and args.pipeline_model_parallel_size == 1
+    ), "since we call forward_step directly, only tp == pp == 1 allowed."
 
     # Data iterator.
     data_iterator = iter(data_loader)
@@ -372,9 +373,9 @@ def __init__(self, batch_size, max_bert_seq_length, embedder_type, warmup=True):
 
         assert args.output_bert_embeddings
 
-        self.models, optimizer, opt_param_scheduler = \
-            setup_model_and_optimizer(model_provider,
-                                      ModelType.encoder_or_decoder)
+        self.models, optimizer, opt_param_scheduler = setup_model_and_optimizer(
+            model_provider, ModelType.encoder_or_decoder
+        )
         self.batch_size = batch_size
         self.max_bert_seq_length = max_bert_seq_length
 
@@ -382,8 +383,7 @@ def __init__(self, batch_size, max_bert_seq_length, embedder_type, warmup=True):
         if embedder_type == "megatron":
             self.huggingface_embedder = None
         elif embedder_type == "huggingface":
-            self.huggingface_embedder = HuggingfaceEmbedder(batch_size,
-                                                            max_bert_seq_length)
+            self.huggingface_embedder = HuggingfaceEmbedder(batch_size, max_bert_seq_length)
         else:
             raise Exception("specialize for embedder type '%s'." % embedder_type)
 
@@ -392,18 +392,20 @@ def __init__(self, batch_size, max_bert_seq_length, embedder_type, warmup=True):
         #   1. batch_size == 1
         #   2. batch_size > 1
         if warmup:
-            warmup_dataset = TextDataset([
-                "great fleas have lesser fleas, upon their backs to bite’em,",
-                "and lesser fleas have lesser fleas, and so, ad infinitum,",
-                "and those great fleas, themselves, in turn have greater fleas to go on,",
-                "while those again have greater still, and greater still, and so on.",
-            ])
+            warmup_dataset = TextDataset(
+                [
+                    "great fleas have lesser fleas, upon their backs to bite’em,",
+                    "and lesser fleas have lesser fleas, and so, ad infinitum,",
+                    "and those great fleas, themselves, in turn have greater fleas to go on,",
+                    "while those again have greater still, and greater still, and so on.",
+                ]
+            )
             print_rank_0("bert / warmup single.")
             for _ in range(3):
-                self.embed_text("hi, bert.")            # batch size == 1
+                self.embed_text("hi, bert.")  # batch size == 1
             print_rank_0("bert / warmup batch.")
             for _ in range(3):
-                self.embed_text_dataset(warmup_dataset) # batch size > 1
+                self.embed_text_dataset(warmup_dataset)  # batch size > 1
 
     def embed_text_dataset(self, text_dataset, tag=None):
         '''Embed a text dataset.'''
@@ -413,8 +415,7 @@ def embed_text_dataset(self, text_dataset, tag=None):
             return self.huggingface_embedder.embed_text_dataset(text_dataset)
 
         # Wrap in a BertEmbeddingDataset to tokenize samples.
-        bert_dataset = BertEmbeddingDataset(text_dataset,
-                                            self.max_bert_seq_length)
+        bert_dataset = BertEmbeddingDataset(text_dataset, self.max_bert_seq_length)
 
         # Embed.
         data_loader = get_data_loader(bert_dataset, self.batch_size)
@@ -430,7 +431,7 @@ def embed_text(self, text):
         '''
 
         # Embed text.
-        text_ds = TextDataset([ text ])
+        text_ds = TextDataset([text])
         embed = self.embed_text_dataset(text_ds)[0]
 
         return embed
@@ -444,8 +445,7 @@ def __init__(self, embedder, block_size):
         self.embedder = embedder
         self.block_size = block_size
 
-    def embed_text_blocks(self, name, dirname, text_dataset,
-                          missing_embedding_blocks):
+    def embed_text_blocks(self, name, dirname, text_dataset, missing_embedding_blocks):
         '''Process a text dataset in blocks.'''
 
         # Iterate blocks.
@@ -456,12 +456,10 @@ def embed_text_blocks(self, name, dirname, text_dataset,
             if block_info is not None:
 
                 # Progress. (*note*: move world progress to here.)
-                print_rank_0("embed '%s' block %d / %d ... %s." % (
-                    name,
-                    block_index,
-                    len(missing_embedding_blocks),
-                    block_info["path"],
-                ))
+                print_rank_0(
+                    "embed '%s' block %d / %d ... %s."
+                    % (name, block_index, len(missing_embedding_blocks), block_info["path"])
+                )
 
                 # Embed block.
                 sub_dataset = Subset(text_dataset, range(*block_info["range"]))
@@ -485,11 +483,8 @@ def embed_text_dataset(self, name, dirname, text_dataset):
         # Missing embedding blocks (stored on disk).
         def validate(f):
             assert f["data"].shape[1] == 1024
-        blocks = get_blocks_by_rank(
-            dirname,
-            len(text_dataset),
-            self.block_size,
-            validate=validate)
+
+        blocks = get_blocks_by_rank(dirname, len(text_dataset), self.block_size, validate=validate)
 
         # Prevent missing file race condition.
         torch.distributed.barrier()
diff --git a/tools/build_sequences_per_dataset.py b/tools/build_sequences_per_dataset.py
index 9213a0e1f53..cb3d2cb1ced 100644
--- a/tools/build_sequences_per_dataset.py
+++ b/tools/build_sequences_per_dataset.py
@@ -10,12 +10,12 @@
 
 import argparse
 import json
-from typing import Optional, Tuple, List
-
+from typing import List, Optional, Tuple
 
 from megatron.core.datasets.indexed_dataset import _IndexReader
 from megatron.training.utils import get_blend_and_blend_per_split
 
+
 def get_paths_from_blend(
     blend: Optional[Tuple[List[str], Optional[List[float]]]],
     blend_per_split: Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]],
@@ -26,27 +26,27 @@ def get_paths_from_blend(
         blend (Optional[Tuple[List[str], Optional[List[float]]]]): A blend tuple containing
             a list of dataset paths and optionally a list of weights, e.g.,
             (["path/to/dataset_1", "path/to/dataset_2"], [0.3, 0.7])
-        blend_per_split (Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]]): 
-            A list of 3 blend tuples (for train, valid, test splits), where each element has 
+        blend_per_split (Optional[List[Optional[Tuple[List[str], Optional[List[float]]]]]]):
+            A list of 3 blend tuples (for train, valid, test splits), where each element has
             the same structure as blend
 
     Returns:
         List[str]: A list of all unique dataset paths found in blend and blend_per_split
     """
     paths = []
-    
+
     # Extract paths from blend
     if blend is not None:
         paths_list, _ = blend
         paths.extend(paths_list)
-    
+
     # Extract paths from blend_per_split
     if blend_per_split is not None:
         for split_blend in blend_per_split:
             if split_blend is not None:
                 split_paths, _ = split_blend
                 paths.extend(split_paths)
-    
+
     # Remove duplicates while preserving order
     seen = set()
     unique_paths = []
@@ -54,9 +54,10 @@ def get_paths_from_blend(
         if path not in seen:
             seen.add(path)
             unique_paths.append(path)
-    
+
     return unique_paths
 
+
 def build_sequences_per_dataset(args):
     print("Building sequences per dataset...")
 
@@ -75,38 +76,67 @@ def build_sequences_per_dataset(args):
 
     return sequence_count_dict
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--data-path', nargs='*', default=None,
-                       help='The weight and prefix list for a set of train, validation, and test'
-                       'datasets which split according to --split. The accepted formats are: '
-                       '(1) a single prefix, '
-                       '(2) a list of weight prefix pairs e.g. weight1 prefix1 weight2 prefix2, '
-                       '(3) a list of prefixes e.g. prefix1 prefix2. '
-                       'For (3), weights are inferred from the lengths of the contributing datasets. '
-                       'This argument is exclusive to the other independent --*-data-path arguments.')
-    parser.add_argument('--train-data-path', nargs='*', default=None,
-                       help='The weight and prefix list for an independent train dataset. '
-                       'Follows the same pattern rules as --data-path.')
-    parser.add_argument('--valid-data-path', nargs='*', default=None,
-                       help='The weight and prefix list for an independent validation dataset. '
-                       'Follows the same pattern rules as --data-path.')
-    parser.add_argument('--test-data-path', nargs='*', default=None,
-                       help='The weight and prefix list for an independent test dataset. '
-                       'Follows the same pattern rules as --data-path.')
-    parser.add_argument('--data-args-path', type=str, default=None,
-                       help='Path to data-args. Instead of feeding `--data-path` '
-                       'with weighted dataset, we pass in a file path from which '
-                       'we read that argument. This is useful when the list of data is '
-                       'too big.')
-    parser.add_argument('--per-split-data-args-path', type=str, default=None,
-                       help='Path to per-split-data-args. Instead of feeding '
-                       '`--(train|valid|test)-data-path` with weighted dataset, '
-                       'we pass in a file path from which we read those arguments. '
-                       'This is useful when the list of data is too big. Format is a '
-                       'json file with `train`, `valid`, `test` keys')
-    parser.add_argument('--per-dataset-sequences-path', type=str, required=True,
-                       help='Path to the output json file with the sequences per dataset.')
+    parser.add_argument(
+        '--data-path',
+        nargs='*',
+        default=None,
+        help='The weight and prefix list for a set of train, validation, and test'
+        'datasets which split according to --split. The accepted formats are: '
+        '(1) a single prefix, '
+        '(2) a list of weight prefix pairs e.g. weight1 prefix1 weight2 prefix2, '
+        '(3) a list of prefixes e.g. prefix1 prefix2. '
+        'For (3), weights are inferred from the lengths of the contributing datasets. '
+        'This argument is exclusive to the other independent --*-data-path arguments.',
+    )
+    parser.add_argument(
+        '--train-data-path',
+        nargs='*',
+        default=None,
+        help='The weight and prefix list for an independent train dataset. '
+        'Follows the same pattern rules as --data-path.',
+    )
+    parser.add_argument(
+        '--valid-data-path',
+        nargs='*',
+        default=None,
+        help='The weight and prefix list for an independent validation dataset. '
+        'Follows the same pattern rules as --data-path.',
+    )
+    parser.add_argument(
+        '--test-data-path',
+        nargs='*',
+        default=None,
+        help='The weight and prefix list for an independent test dataset. '
+        'Follows the same pattern rules as --data-path.',
+    )
+    parser.add_argument(
+        '--data-args-path',
+        type=str,
+        default=None,
+        help='Path to data-args. Instead of feeding `--data-path` '
+        'with weighted dataset, we pass in a file path from which '
+        'we read that argument. This is useful when the list of data is '
+        'too big.',
+    )
+    parser.add_argument(
+        '--per-split-data-args-path',
+        type=str,
+        default=None,
+        help='Path to per-split-data-args. Instead of feeding '
+        '`--(train|valid|test)-data-path` with weighted dataset, '
+        'we pass in a file path from which we read those arguments. '
+        'This is useful when the list of data is too big. Format is a '
+        'json file with `train`, `valid`, `test` keys',
+    )
+    parser.add_argument(
+        '--per-dataset-sequences-path',
+        type=str,
+        required=True,
+        help='Path to the output json file with the sequences per dataset.',
+    )
     args = parser.parse_args()
 
     sequence_count_dict = build_sequences_per_dataset(args)
@@ -114,4 +144,4 @@ def build_sequences_per_dataset(args):
     with open(args.per_dataset_sequences_path, "w") as f:
         json.dump(sequence_count_dict, f)
 
-    print(f"Done! Saving --per-dataset-sequences-path file to {args.per_dataset_sequences_path}")
\ No newline at end of file
+    print(f"Done! Saving --per-dataset-sequences-path file to {args.per_dataset_sequences_path}")
diff --git a/tools/checkpoint/checkpoint_inspector.py b/tools/checkpoint/checkpoint_inspector.py
index 74da955912f..9033ff96a17 100644
--- a/tools/checkpoint/checkpoint_inspector.py
+++ b/tools/checkpoint/checkpoint_inspector.py
@@ -11,12 +11,12 @@
 import io
 import json
 import os
-from pathlib import Path
-import time
 import re
 import shutil
-from typing import Optional
 import tempfile
+import time
+from pathlib import Path
+from typing import Optional
 
 import click
 import torch
@@ -29,23 +29,20 @@
     FileSystemWriter,
 )
 from torch.distributed.checkpoint.format_utils import dcp_to_torch_save
-from torch.distributed.checkpoint.metadata import (
-    BytesStorageMetadata,
-    TensorStorageMetadata,
-)
+from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata
 from torch.distributed.checkpoint.state_dict_saver import _save_state_dict
 from torch.distributed.tensor import DeviceMesh, Replicate, Shard
 
-from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import split_dtensor, redistribute_uneven_dtensor_to_replicated
-
 from megatron.core.dist_checkpointing.strategies.common import load_common
 from megatron.core.dist_checkpointing.strategies.fully_parallel import (
     FullyParallelLoadStrategyWrapper,
 )
-from megatron.core.dist_checkpointing.strategies.torch import (
-    TorchDistLoadShardedStrategy,
-)
+from megatron.core.dist_checkpointing.strategies.torch import TorchDistLoadShardedStrategy
 from megatron.core.dist_checkpointing.validation import verify_checkpoint
+from megatron.core.distributed.fsdp.src.megatron_fsdp.uneven_dtensor import (
+    redistribute_uneven_dtensor_to_replicated,
+    split_dtensor,
+)
 from megatron.core.msc_utils import MultiStorageClientFeature
 
 
@@ -70,7 +67,9 @@ def cli():
 @cli.command()
 @click.argument("checkpoint_dir", type=click.Path(exists=True))
 @click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.")
-@click.option("--not-ignore-param-to-group-meta", is_flag=True, help="Ignore parameter-to-group metadata.")
+@click.option(
+    "--not-ignore-param-to-group-meta", is_flag=True, help="Ignore parameter-to-group metadata."
+)
 def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta):
     """Inspect a Megatron Core Distributed Checkpoint"""
     ckpt_path = Path(checkpoint_dir)
@@ -82,18 +81,12 @@ def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta):
     metadata_json = ckpt_path / "metadata.json"
     if not metadata_json.exists():
         click.echo(
-            click.style(
-                "Metadata file not found in the checkpoint directory.",
-                fg="red",
-                bold=True,
-            )
+            click.style("Metadata file not found in the checkpoint directory.", fg="red", bold=True)
         )
     else:
         metadata_json = json.loads(metadata_json.read_text())
         print_header("checkpoint metadata", "blue")
-        click.echo(
-            click.style(json.dumps(metadata_json, indent=2), fg="bright_magenta")
-        )
+        click.echo(click.style(json.dumps(metadata_json, indent=2), fg="bright_magenta"))
 
     try:
         # Strategies initialization
@@ -102,9 +95,7 @@ def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta):
         verify_checkpoint(checkpoint_dir)
         assert isinstance(
             sharded_strategy.base_strategy, TorchDistLoadShardedStrategy
-        ), click.style(
-            f"Unsupported sharded strategy: {sharded_strategy}", fg="red", bold=True
-        )
+        ), click.style(f"Unsupported sharded strategy: {sharded_strategy}", fg="red", bold=True)
 
         # Common state section
         common_state = load_common(checkpoint_dir)
@@ -115,17 +106,14 @@ def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta):
                 f"  {bullet} {click.style(key, fg='green')}: {click.style(str(value), fg='white')}"
             )
     except:
-        click.echo(
-            click.style("Failed to load checkpoint strategies.", fg="red", bold=True)
-        )
+        click.echo(click.style("Failed to load checkpoint strategies.", fg="red", bold=True))
 
     # Tensor metadata section
     reader = FileSystemReader(ckpt_path)
     metadata = reader.read_metadata()
-    total_tensors = len([
-        v for v in metadata.state_dict_metadata.values()
-        if isinstance(v, TensorStorageMetadata)
-    ])
+    total_tensors = len(
+        [v for v in metadata.state_dict_metadata.values() if isinstance(v, TensorStorageMetadata)]
+    )
     total_elements = sum(
         v.size.numel()
         for v in metadata.state_dict_metadata.values()
@@ -134,12 +122,8 @@ def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta):
 
     print_header("sharded tensors metadata", "yellow")
     stats = [
-        click.style(
-            f"Total Tensors: {total_tensors}", fg="bright_magenta"
-        ),
-        click.style(
-            f"Total Elements: {total_elements / 1e9:.2f}B", fg="bright_magenta"
-        ),
+        click.style(f"Total Tensors: {total_tensors}", fg="bright_magenta"),
+        click.style(f"Total Elements: {total_elements / 1e9:.2f}B", fg="bright_magenta"),
     ]
     click.echo(" | ".join(stats) + "\n")
 
@@ -159,12 +143,13 @@ def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta):
                 continue
             click.echo(f"  {bullet} {key_styled} {click.style('[BYTES]', fg='yellow')}")
         else:
-            click.echo(
-                f"  {bullet} {key_styled} {click.style('[UNKNOWN TYPE]', fg='red')}"
-            )
+            click.echo(f"  {bullet} {key_styled} {click.style('[UNKNOWN TYPE]', fg='red')}")
     if ignore_param_to_group_meta:
         click.echo(
-            click.style(f"Ignored parameter-to-group metadata: {ignore_param_to_group_meta_count}", fg="yellow")
+            click.style(
+                f"Ignored parameter-to-group metadata: {ignore_param_to_group_meta_count}",
+                fg="yellow",
+            )
         )
 
     # MCore data section
@@ -177,9 +162,7 @@ def inspect(checkpoint_dir, enable_msc, not_ignore_param_to_group_meta):
                 f"  {bullet} {click.style(key, fg='blue')}: {click.style(str(value), fg='white')}"
             )
     except:
-        click.echo(
-            click.style("No MCore data found in the checkpoint.", fg="red", bold=True)
-        )
+        click.echo(click.style("No MCore data found in the checkpoint.", fg="red", bold=True))
         pass
 
 
@@ -197,9 +180,7 @@ def print_tensor(checkpoint_dir, key):
     print_header("tensor metadata", "green")
     if key not in metadata.state_dict_metadata:
         click.echo(
-            click.style(
-                f"Key '{key}' not found in checkpoint metadata.", fg="red", bold=True
-            )
+            click.style(f"Key '{key}' not found in checkpoint metadata.", fg="red", bold=True)
         )
         return
 
@@ -207,9 +188,7 @@ def print_tensor(checkpoint_dir, key):
     if isinstance(tensor_metadata, TensorStorageMetadata):
         click.echo(click.style(f"Key: {key}", fg="blue"))
         click.echo(click.style(f"Shape: {tensor_metadata.size}", fg="cyan"))
-        click.echo(
-            click.style(f"Dtype: {tensor_metadata.properties.dtype}", fg="magenta")
-        )
+        click.echo(click.style(f"Dtype: {tensor_metadata.properties.dtype}", fg="magenta"))
     elif isinstance(tensor_metadata, BytesStorageMetadata):
         click.echo(click.style(f"Key: {key} (Bytes Storage)", fg="blue"))
     else:
@@ -264,9 +243,7 @@ def check_gpu_memory(threshold=0.9):
     near_full = allocated_ratio >= threshold or reserved_ratio >= threshold
 
     if near_full and torch.distributed.get_rank() == 0:
-        print(
-            f"GPU Memory: Allocated: {allocated_ratio:.2%}, Reserved: {reserved_ratio:.2%}"
-        )
+        print(f"GPU Memory: Allocated: {allocated_ratio:.2%}, Reserved: {reserved_ratio:.2%}")
     return near_full
 
 
@@ -315,9 +292,7 @@ class VerboseLoadPlanner(DefaultLoadPlanner):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
 
-    def set_up_planner(
-        self, state_dict, metadata, is_coordinator: bool = False
-    ) -> None:
+    def set_up_planner(self, state_dict, metadata, is_coordinator: bool = False) -> None:
         self.__total_items = len(state_dict)
         self.__resolve_items = {}
         super().set_up_planner(state_dict, metadata, is_coordinator)
@@ -395,14 +370,11 @@ def convert_checkpoint(
     for key, md in metadata.state_dict_metadata.items():
         if isinstance(md, TensorStorageMetadata):
             # Initialize tensor storage
-            assert len(md.size) > 0, (
-                f"Expected size for key '{key}' to be non-empty, got {md.size}."
-            )
+            assert (
+                len(md.size) > 0
+            ), f"Expected size for key '{key}' to be non-empty, got {md.size}."
             state_dict[key] = torch.distributed.tensor.empty(
-                md.size,
-                dtype=md.properties.dtype,
-                device_mesh=device_mesh,
-                placements=[Shard(0)],
+                md.size, dtype=md.properties.dtype, device_mesh=device_mesh, placements=[Shard(0)]
             )
         elif isinstance(md, BytesStorageMetadata):
             # Initialize bytes storage
@@ -425,7 +397,7 @@ def convert_checkpoint(
         _swiglu_prefixes = None  # None = match everything (backward compatible)
         rank0_echo("[SWiGLU] Global splitting enabled (all modules).")
     else:
-        _swiglu_prefixes = []    # no SWiGLU splitting
+        _swiglu_prefixes = []  # no SWiGLU splitting
         rank0_echo("[SWiGLU] Disabled (no --swiglu or --swiglu-modules specified).")
 
     # --- Auto-detect MTP from checkpoint keys ---
@@ -436,8 +408,7 @@ def convert_checkpoint(
     _detected = []
 
     if not rename_mtp_keys and any(
-        ".mtp.layers." in k and ".transformer_layer." in k
-        for k in all_keys
+        ".mtp.layers." in k and ".transformer_layer." in k for k in all_keys
     ):
         rename_mtp_keys = True
         _detected.append("MTP (transformer_layer -> mtp_model_layer rename needed)")
@@ -465,9 +436,7 @@ def _free_up_some_gpu_memory():
             torch.cuda.empty_cache()
 
     def split_layers(
-        key: str,
-        value: torch.Tensor,
-        orig_shape: Optional[torch.Size] = None,
+        key: str, value: torch.Tensor, orig_shape: Optional[torch.Size] = None
     ) -> dict[str, torch.Tensor]:
         """
         Split layers into separate tensors.
@@ -475,9 +444,11 @@ def split_layers(
         _free_up_some_gpu_memory()
         layers = {}
         for i, v in enumerate(split_dtensor(value, 1, dim=0)):
-            v = redistribute_uneven_dtensor_to_replicated(v).reshape(
-                orig_shape[1:] if orig_shape else value.shape[1:]
-            ).redistribute(placements=[Shard(0)])
+            v = (
+                redistribute_uneven_dtensor_to_replicated(v)
+                .reshape(orig_shape[1:] if orig_shape else value.shape[1:])
+                .redistribute(placements=[Shard(0)])
+            )
 
             layer_key = key.replace(".layers.", f".layers.{i}.")
             layers[layer_key] = v
@@ -485,9 +456,7 @@ def split_layers(
         return layers
 
     def split_expert_weights(
-        key: str,
-        value: torch.Tensor,
-        orig_shape: Optional[torch.Size] = None,
+        key: str, value: torch.Tensor, orig_shape: Optional[torch.Size] = None
     ) -> dict[str, torch.Tensor]:
         """
         Split expert weights into separate tensors for each expert.
@@ -601,9 +570,9 @@ def has_layer_index(key: str) -> bool:
                 and "nd_reformulated_orig_global_shape" in metadata.mcore_data[key]
             ):
                 mcore_data = metadata.mcore_data[key]
-                assert len(mcore_data) == 1, (
-                    f"Expected exactly one reformulated shape for key '{key}'."
-                )
+                assert (
+                    len(mcore_data) == 1
+                ), f"Expected exactly one reformulated shape for key '{key}'."
                 # Get the original global shape from mcore_data
                 orig_shape = mcore_data["nd_reformulated_orig_global_shape"]
                 metadata.mcore_data[key] = {}
@@ -656,10 +625,7 @@ def has_layer_index(key: str) -> bool:
             # Skip RNG states
             continue
         elif key.startswith("rerun_state_machine_state"):
-            if (
-                rerun_state_machine_state is not None
-                and torch.distributed.get_rank() == 0
-            ):
+            if rerun_state_machine_state is not None and torch.distributed.get_rank() == 0:
                 click.echo(
                     click.style(
                         "Warning: Multiple rerun_state_machine_state found, only the first one will be saved.",
@@ -688,7 +654,9 @@ def has_layer_index(key: str) -> bool:
     if _swiglu_split_count > 0:
         rank0_echo(f"[SWiGLU] Split {_swiglu_split_count} fc1 keys into _w/_v pairs.")
     elif _swiglu_prefixes is not None and len(_swiglu_prefixes) > 0:
-        rank0_echo("[SWiGLU] WARNING: modules specified but 0 keys were split — check module names.")
+        rank0_echo(
+            "[SWiGLU] WARNING: modules specified but 0 keys were split — check module names."
+        )
 
     # Rename MTP keys: torch_dist uses "transformer_layer" for MTP sub-modules,
     # but the FSDP model's state_dict() uses "mtp_model_layer".
@@ -705,8 +673,10 @@ def has_layer_index(key: str) -> bool:
                     param_to_param_group_map[new_k] = param_to_param_group_map.pop(k)
                 renamed_count += 1
         if renamed_count > 0:
-            rank0_echo(f"[MTP rename] Renamed {renamed_count} keys: "
-                       f"'transformer_layer' -> 'mtp_model_layer'.")
+            rank0_echo(
+                f"[MTP rename] Renamed {renamed_count} keys: "
+                f"'transformer_layer' -> 'mtp_model_layer'."
+            )
 
     # Move back to GPU if necessary
     for key in fsdp_dtensor_state_dict:
@@ -730,10 +700,8 @@ def has_layer_index(key: str) -> bool:
     sharded_strategy = TorchDistLoadShardedStrategy()
     sharded_strategy = FullyParallelLoadStrategyWrapper(sharded_strategy)
     verify_checkpoint(str(input_dir))
-    assert isinstance(sharded_strategy.base_strategy, TorchDistLoadShardedStrategy), (
-        click.style(
-            f"Unsupported sharded strategy: {sharded_strategy}", fg="red", bold=True
-        )
+    assert isinstance(sharded_strategy.base_strategy, TorchDistLoadShardedStrategy), click.style(
+        f"Unsupported sharded strategy: {sharded_strategy}", fg="red", bold=True
     )
     common_state = load_common(input_dir)
     try:
@@ -748,12 +716,10 @@ def has_layer_index(key: str) -> bool:
     common_state = flatten(common_state)
     for key, value in common_state.items():
         if key.startswith("optimizer.optimizer.param_groups."):
-            key = key.replace(
-                "optimizer.optimizer.param_groups.", "optimizer.param_groups."
-            )
-        assert key not in fsdp_dtensor_state_dict, (
-            f"Key '{key}' already exists in fsdp_dtensor_state_dict."
-        )
+            key = key.replace("optimizer.optimizer.param_groups.", "optimizer.param_groups.")
+        assert (
+            key not in fsdp_dtensor_state_dict
+        ), f"Key '{key}' already exists in fsdp_dtensor_state_dict."
         fsdp_dtensor_state_dict[key] = value
 
     # set up per-parameter param_groups
@@ -764,11 +730,13 @@ def has_layer_index(key: str) -> bool:
 
             assert name in param_to_param_group_map, f"Missing param group for {name}"
             param_group_id = param_to_param_group_map[name]
-            assert param_group_id < len(ckpt_param_groups), f"Invalid param group id {param_group_id} for {name}"
-            name_without_prefix = name[len(model_weight_prefix):]
-            fsdp_dtensor_state_dict[
-                f"{optimizer_param_to_group_prefix}.{name_without_prefix}"
-            ] = ckpt_param_groups[param_group_id]
+            assert param_group_id < len(
+                ckpt_param_groups
+            ), f"Invalid param group id {param_group_id} for {name}"
+            name_without_prefix = name[len(model_weight_prefix) :]
+            fsdp_dtensor_state_dict[f"{optimizer_param_to_group_prefix}.{name_without_prefix}"] = (
+                ckpt_param_groups[param_group_id]
+            )
 
     if "checkpoint_version" not in fsdp_dtensor_state_dict:
         fsdp_dtensor_state_dict["checkpoint_version"] = 3.0
@@ -776,7 +744,7 @@ def has_layer_index(key: str) -> bool:
     # Save modified checkpoint
     save_checkpoint_with_pickle_protocol(fsdp_dtensor_state_dict, output_dir)
 
-    dist.barrier()              # Synchronize all ranks
+    dist.barrier()  # Synchronize all ranks
     dist.destroy_process_group()
 
 
@@ -787,20 +755,18 @@ def has_layer_index(key: str) -> bool:
     "--swiglu",
     is_flag=True,
     help="Split SWiGLU fc1 weights/biases into _w and _v for ALL modules. "
-         "Use --swiglu-modules instead if only some modules use SWiGLU (e.g. VLMs). "
-         "NOT auto-detected; must be specified manually.",
+    "Use --swiglu-modules instead if only some modules use SWiGLU (e.g. VLMs). "
+    "NOT auto-detected; must be specified manually.",
 )
 @click.option(
     "--swiglu-modules",
     type=str,
     default=None,
     help="Comma-separated module names that use SWiGLU (e.g. 'language_model'). "
-         "Only these modules will have fc1 split into _w/_v. Overrides --swiglu. "
-         "NOT auto-detected; must be specified manually.",
-)
-@click.option(
-    "--oom-traceback", is_flag=True, help="Enable OOM traceback for debugging."
+    "Only these modules will have fc1 split into _w/_v. Overrides --swiglu. "
+    "NOT auto-detected; must be specified manually.",
 )
+@click.option("--oom-traceback", is_flag=True, help="Enable OOM traceback for debugging.")
 @click.option("--enable-msc", is_flag=True, help="Enable MultiStorageClient feature.")
 @click.option(
     "--output-optimizer-state-prefix",
@@ -816,15 +782,15 @@ def has_layer_index(key: str) -> bool:
     "--param-to-param-group-map-json",
     type=str,
     default="{}",
-    help="JSON string representing the param to parameter group map."
+    help="JSON string representing the param to parameter group map.",
 )
 @click.option(
     "--rename-mtp-keys",
     is_flag=True,
     help="Rename MTP layer keys from 'transformer_layer' to 'mtp_model_layer' "
-         "to match the FSDP model's state_dict naming. "
-         "Auto-detected if not set: enabled when '.mtp.layers.*.transformer_layer' "
-         "keys are found in the checkpoint.",
+    "to match the FSDP model's state_dict naming. "
+    "Auto-detected if not set: enabled when '.mtp.layers.*.transformer_layer' "
+    "keys are found in the checkpoint.",
 )
 def convert_torch_dist_to_fsdp_dtensor(
     input_dir,
@@ -908,14 +874,10 @@ def oom_observer(device, alloc, device_alloc, device_free):
             snapshot = torch.cuda.memory._snapshot()
             from pickle import dump
 
-            dump(
-                snapshot,
-                open(f"oom_rank-{torch.distributed.get_rank()}_snapshot.pickle", "wb"),
-            )
+            dump(snapshot, open(f"oom_rank-{torch.distributed.get_rank()}_snapshot.pickle", "wb"))
 
         torch._C._cuda_attach_out_of_memory_observer(oom_observer)
 
-
     # Initialize distributed process group
     init_process_group(f"convert_torch_dist_to_fsdp_dtensor from {input_dir} to {output_dir}")
 
@@ -925,10 +887,14 @@ def oom_observer(device, alloc, device_alloc, device_free):
         param_to_param_group_map = json.load(f)
     _swiglu_modules = (
         [m.strip() for m in swiglu_modules.split(",") if m.strip()]
-        if swiglu_modules is not None else None
+        if swiglu_modules is not None
+        else None
     )
     convert_checkpoint(
-        ckpt_path, output_dir, swiglu, process_group=dist.group.WORLD,
+        ckpt_path,
+        output_dir,
+        swiglu,
+        process_group=dist.group.WORLD,
         optimizer_state_prefix=output_optimizer_state_prefix,
         model_weight_prefix=output_model_weight_prefix,
         param_to_param_group_map=param_to_param_group_map,
@@ -936,11 +902,7 @@ def oom_observer(device, alloc, device_alloc, device_free):
         swiglu_modules=_swiglu_modules,
     )
 
-    click.echo(
-        click.style(
-            f"Converted checkpoint saved to {output_dir}.", fg="green", bold=True
-        )
-    )
+    click.echo(click.style(f"Converted checkpoint saved to {output_dir}.", fg="green", bold=True))
 
 
 def _modify_state_dict(input_dir, output_dir, ops, process_group, enable_msc=False):
@@ -951,10 +913,14 @@ def _modify_state_dict(input_dir, output_dir, ops, process_group, enable_msc=Fal
         assert isinstance(op, str), f"Operation '{op}' must be a string."
         op_items = op.split()
         if op_items[0] == "remove":
-            assert len(op_items) == 2, f"Remove operation requires exactly one argument: {op_items[1]}"
+            assert (
+                len(op_items) == 2
+            ), f"Remove operation requires exactly one argument: {op_items[1]}"
             remove_items.append(op_items[1])
         elif op_items[0] == "rename":
-            assert len(op_items) == 3, f"Rename operation requires exactly two arguments: {op_items[1]} {op_items[2]}"
+            assert (
+                len(op_items) == 3
+            ), f"Rename operation requires exactly two arguments: {op_items[1]} {op_items[2]}"
             rename_items.append((op_items[1], op_items[2]))
         else:
             raise NotImplementedError(f"Unsupported operation: {op} | {op_items}")
@@ -966,9 +932,7 @@ def _modify_state_dict(input_dir, output_dir, ops, process_group, enable_msc=Fal
     for key, md in metadata.state_dict_metadata.items():
         if re.search(combined_remove_items, key):
             if torch.distributed.get_rank() == 0:
-                click.echo(
-                    click.style(f"Removing key '{key}' from state_dict.", fg="yellow")
-                )
+                click.echo(click.style(f"Removing key '{key}' from state_dict.", fg="yellow"))
             if hasattr(metadata, "mcore_data") and key in metadata.mcore_data:
                 del metadata.mcore_data[key]
             continue
@@ -992,10 +956,7 @@ def _modify_state_dict(input_dir, output_dir, ops, process_group, enable_msc=Fal
             state_dict[key] = torch.distributed.tensor.empty(
                 md.size,
                 dtype=md.properties.dtype,
-                device_mesh=DeviceMesh.from_group(
-                    group=process_group,
-                    device_type="cuda",
-                ),
+                device_mesh=DeviceMesh.from_group(group=process_group, device_type="cuda"),
                 placements=[Shard(0)],
             )
         elif isinstance(md, BytesStorageMetadata):
@@ -1004,15 +965,9 @@ def _modify_state_dict(input_dir, output_dir, ops, process_group, enable_msc=Fal
             raise NotImplementedError(f"Unsupported metadata type: {type(md)}")
 
     # Save the modified state dict
-    click.echo(
-        click.style(
-            f"Saving modified state_dict to {output_dir}.", fg="green", bold=True
-        )
-    )
+    click.echo(click.style(f"Saving modified state_dict to {output_dir}.", fg="green", bold=True))
     save_checkpoint_with_pickle_protocol(
-        state_dict,
-        output_dir,
-        pickle_protocol=4,  # Use protocol 4 for OOM issue
+        state_dict, output_dir, pickle_protocol=4  # Use protocol 4 for OOM issue
     )
 
     # Copy metadata.json, common.pt
@@ -1034,17 +989,11 @@ def modify_state_dict(input_dir, output_dir, op, enable_msc):
         MultiStorageClientFeature.disable()
 
     _modify_state_dict(
-        Path(input_dir),
-        Path(output_dir),
-        op,
-        process_group=dist.group.WORLD,
-        enable_msc=enable_msc,
+        Path(input_dir), Path(output_dir), op, process_group=dist.group.WORLD, enable_msc=enable_msc
     )
 
     click.echo(
-        click.style(
-            f"State dict items modified and saved to {output_dir}.", fg="green", bold=True
-        )
+        click.style(f"State dict items modified and saved to {output_dir}.", fg="green", bold=True)
     )
 
 
@@ -1083,7 +1032,11 @@ def _compare_two_checkpoint(checkpoint_1, checkpoint_2):
             continue
 
         if meta_1.size != meta_2.size or meta_1.properties.dtype != meta_2.properties.dtype:
-            click.echo(click.style(f" - {key} (metadata differ) meta_1: {meta_1}, meta_2: {meta_2}", fg="red"))
+            click.echo(
+                click.style(
+                    f" - {key} (metadata differ) meta_1: {meta_1}, meta_2: {meta_2}", fg="red"
+                )
+            )
         else:
             value_1 = torch.empty(meta_1.size, dtype=meta_1.properties.dtype)
             value_2 = value_1.clone()
@@ -1091,10 +1044,12 @@ def _compare_two_checkpoint(checkpoint_1, checkpoint_2):
             dcp.load({key: value_1}, storage_reader=reader_1, planner=DefaultLoadPlanner())
             dcp.load({key: value_2}, storage_reader=reader_2, planner=DefaultLoadPlanner())
 
-            if not torch.allclose(
-                value_1, value_2, atol=1e-8, rtol=1e-5
-            ):
-                click.echo(click.style(f" - {key} (values differ) value_1: {value_1}, value_2: {value_2}", fg="red"))
+            if not torch.allclose(value_1, value_2, atol=1e-8, rtol=1e-5):
+                click.echo(
+                    click.style(
+                        f" - {key} (values differ) value_1: {value_1}, value_2: {value_2}", fg="red"
+                    )
+                )
 
 
 @cli.command()
@@ -1110,14 +1065,13 @@ def compare_two_checkpoint(checkpoint_1, checkpoint_2, enable_msc):
     if not enable_msc:
         MultiStorageClientFeature.disable()
 
-    _compare_two_checkpoint(
-        Path(checkpoint_1),
-        Path(checkpoint_2),
-    )
+    _compare_two_checkpoint(Path(checkpoint_1), Path(checkpoint_2))
 
     click.echo(
         click.style(
-            f"Comparison between {checkpoint_1} and {checkpoint_2} completed.", fg="green", bold=True
+            f"Comparison between {checkpoint_1} and {checkpoint_2} completed.",
+            fg="green",
+            bold=True,
         )
     )
 
@@ -1140,9 +1094,11 @@ def print_torch_dcp_in_json(torch_dcp_dir, model_weight_prefix="model.module"):
         for key, value in state_dict.items():
             new_key = key.replace("module.module", model_weight_prefix)
             new_state_dict[new_key] = value
-        
+
         # Convert state dict to JSON-serializable format
-        serializable_dict = {k: v.tolist() if hasattr(v, "tolist") else v for k, v in new_state_dict.items()}
+        serializable_dict = {
+            k: v.tolist() if hasattr(v, "tolist") else v for k, v in new_state_dict.items()
+        }
 
         # Save to a JSON file
         json_file_path = os.path.join(torch_dcp_dir, "param_to_param_group_map.json")
@@ -1158,11 +1114,7 @@ def init_process_group(message):
     time.sleep(rank * 0.01)  # Ensure all ranks are synchronized before loading
     click.echo(f"[{rank}/{world_size}] [cuda:{local_rank}] {message}")
     torch.cuda.set_device(local_rank)
-    dist.init_process_group(
-        backend="nccl",
-        rank=rank,
-        world_size=world_size,
-    )
+    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
 
 
 if __name__ == "__main__":
diff --git a/tools/checkpoint/loader_base.py b/tools/checkpoint/loader_base.py
index e12e9545b87..11e9224f726 100644
--- a/tools/checkpoint/loader_base.py
+++ b/tools/checkpoint/loader_base.py
@@ -3,10 +3,11 @@
 import os
 import sys
 import types
-import torch
 
+import torch
 from utils import _ConverterFakeProcessGroup, print_memory_usage
 
+
 class MegatronCheckpointLoaderBase:
     """Orchestrates loading a Megatron checkpoint and sending
     model parameters over a given multiprocessing queue.
@@ -20,10 +21,10 @@ def __init__(self, args, queue, build_tokenizer=False):
         self.args = args
         self.queue = queue
         self.build_tokenizer = build_tokenizer
-        self.margs = None            # Will hold Megatron's main args
+        self.margs = None  # Will hold Megatron's main args
         self.checkpoint_args = None  # Will hold additional checkpoint args
-        self.all_models = None       # Model sharded over different parallelism
-        self.md = None               # Metadata sent to the saver
+        self.all_models = None  # Model sharded over different parallelism
+        self.md = None  # Metadata sent to the saver
         self.consumed_train_samples = None
         self.consumed_valid_samples = None
 
@@ -68,12 +69,13 @@ def parse_megatron_args(self):
         # Expert parallelism requires sequence parallelism
         if margs.expert_model_parallel_size > 1:
             margs.sequence_parallel = True
-        
+
         margs = self._maybe_parse_additional_megatron_args(margs, checkpoint_args)
 
         # Validate final arguments
         try:
             from megatron.training.arguments import validate_args
+
             margs = validate_args(margs)
         except Exception as e:
             print(f"Error validating Megatron arguments: {e}")
@@ -133,8 +135,8 @@ def initialize_megatron_env(self):
         Initialize Megatron global variables and fused kernels.
         """
         try:
-            from megatron.training.global_vars import set_global_variables
             from megatron.core import mpu
+            from megatron.training.global_vars import set_global_variables
         except ModuleNotFoundError as e:
             print(f"Unable to import required Megatron modules: {e}")
             self.queue.put("exit")
@@ -143,9 +145,11 @@ def initialize_megatron_env(self):
         set_global_variables(self.margs, build_tokenizer=self.build_tokenizer)
         mpu.set_tensor_model_parallel_world_size(self.margs.tensor_model_parallel_size)
         mpu.set_pipeline_model_parallel_world_size(self.margs.pipeline_model_parallel_size)
-        mpu.set_virtual_pipeline_model_parallel_world_size(self.margs.virtual_pipeline_model_parallel_size)
+        mpu.set_virtual_pipeline_model_parallel_world_size(
+            self.margs.virtual_pipeline_model_parallel_size
+        )
         mpu.set_expert_model_parallel_world_size(self.margs.expert_model_parallel_size)
-        
+
         # For backward compatibility during local parallel states refactoring
         fake_tp_group = _ConverterFakeProcessGroup(size=self.margs.tensor_model_parallel_size)
         fake_ep_group = _ConverterFakeProcessGroup(size=self.margs.expert_model_parallel_size)
@@ -170,7 +174,9 @@ def verify_vocabs_match(self, true_vocab_size):
         if self.args.true_vocab_size is not None and self.args.vocab_file is not None:
             vocab = json.load(open(self.args.vocab_file))
             if len(vocab) != self.args.true_vocab_size:
-                print("Both --true-vocab-size and --vocab-file specified but vocab sizes do not match. Aborting.")
+                print(
+                    "Both --true-vocab-size and --vocab-file specified but vocab sizes do not match. Aborting."
+                )
                 return False
         return True
 
@@ -203,8 +209,9 @@ def get_models_for_pipeline_stage(count, dtype):
                     mpu.set_virtual_pipeline_model_parallel_rank(i)
                     pre_process = mpu.is_pipeline_first_stage()
                     post_process = mpu.is_pipeline_last_stage()
-                    this_model = model_provider(pre_process=pre_process,
-                                                post_process=post_process).to(dtype)
+                    this_model = model_provider(
+                        pre_process=pre_process, post_process=post_process
+                    ).to(dtype)
                     model_list.append(this_model)
 
                 # Each time we load, we set counters to 0, pass None for optimizer/ LR
@@ -240,7 +247,7 @@ def get_models_for_pipeline_stage(count, dtype):
             all_models.append(get_models_for_pipeline_stage(tp_size, dtype))
 
         return all_models, consumed_train_samples, consumed_valid_samples
-    
+
     def send_metadata_over_queue(self):
         # Let the consumer know the overall metadata:
         self.md.consumed_train_samples = self.consumed_train_samples
@@ -267,9 +274,7 @@ def send_llm_over_queue(self, schema):
 
         # 1) Embeddings
         embeddings = [schema.get("embeddings", m) for m in first_pipeline_models]
-        message = {
-            "word embeddings": torch.cat([e["word"] for e in embeddings], dim=0)
-        }
+        message = {"word embeddings": torch.cat([e["word"] for e in embeddings], dim=0)}
         if self.md.position_embedding_type == 'learned_absolute':
             # Only send one set from rank 0
             message["position embeddings"] = embeddings[0]["pos"]
@@ -351,19 +356,14 @@ def send_llm_over_queue(self, schema):
         # 4) Output layer
         if self.md.output_layer:
             output_layers = [schema.get("output_layer", m) for m in models]
-            message = {
-                "weight": torch.cat([layer["weight"] for layer in output_layers], dim=0),
-            }
+            message = {"weight": torch.cat([layer["weight"] for layer in output_layers], dim=0)}
             self.queue_put("output layer", message)
 
         # 5) BERT-specific parameters
         if self.md.model_type == 'BERT':
             # Pooler
             pooler = schema.get("pooler", models[0])
-            message = {
-                "weight": pooler["weight"],
-                "bias": pooler["bias"],
-            }
+            message = {"weight": pooler["weight"], "bias": pooler["bias"]}
             self.queue_put("pooler", message)
 
             # LM head
@@ -380,10 +380,7 @@ def send_llm_over_queue(self, schema):
             # Binary head
             if self.md.bert_binary_head:
                 binary_head = schema.get("binary_head", models[0])
-                message = {
-                    "weight": binary_head["weight"],
-                    "bias": binary_head["bias"],
-                }
+                message = {"weight": binary_head["weight"], "bias": binary_head["bias"]}
                 self.queue_put("binary head", message)
 
         # Done
@@ -415,12 +412,11 @@ def load(self):
         self.md = self.build_checkpoint_metadata(true_vocab_size)
 
         # 7) Load all model shards
-        self.all_models, self.consumed_train_samples, self.consumed_valid_samples = self.load_model_shards(
-            model_provider,
-            self.md.params_dtype
+        self.all_models, self.consumed_train_samples, self.consumed_valid_samples = (
+            self.load_model_shards(model_provider, self.md.params_dtype)
         )
 
-        # 8) Send model over the queue        
+        # 8) Send model over the queue
         self.send_model_over_queue()
 
     def build_checkpoint_metadata(self, true_vocab_size):
@@ -430,7 +426,7 @@ def build_checkpoint_metadata(self, true_vocab_size):
         norm_has_bias = True
         if hasattr(self.checkpoint_args, 'normalization'):
             # For older models, normalization was always "LayerNorm".
-            norm_has_bias = (self.checkpoint_args.normalization == "LayerNorm")
+            norm_has_bias = self.checkpoint_args.normalization == "LayerNorm"
 
         md = types.SimpleNamespace()
         md.model_type = self.args.model_type
@@ -469,14 +465,16 @@ def build_sys_argv(self):
             '--no-bias-gelu-fusion',
             '--no-bias-dropout-fusion',
             '--use-cpu-initialization',
-            '--micro-batch-size', '1',
+            '--micro-batch-size',
+            '1',
             '--no-load-optim',
             '--no-load-rng',
             '--no-save-optim',
             '--no-save-rng',
             '--no-initialization',
             '--mock-data',  # To pass the "blend data checks" in arguments.py
-            '--load', self.args.load_dir,
+            '--load',
+            self.args.load_dir,
             '--exit-on-missing-checkpoint',
             '--use-mp-args-from-checkpoint-args',
             '--no-one-logger',
@@ -489,4 +487,3 @@ def import_model_provider(self):
     def send_model_over_queue(self):
         """Creates model schema and sends the model over the queue"""
         raise NotImplementedError
-
diff --git a/tools/checkpoint/loader_legacy.py b/tools/checkpoint/loader_legacy.py
index 31f4e2aeb09..0dffa4efff8 100644
--- a/tools/checkpoint/loader_legacy.py
+++ b/tools/checkpoint/loader_legacy.py
@@ -4,7 +4,6 @@
 import os
 import sys
 import types
-
 from functools import partial
 
 import torch
@@ -15,62 +14,81 @@
 def add_arguments(parser):
     group = parser.add_argument_group(title='Megatron loader')
 
-    group.add_argument('--true-vocab-size', type=int, default=None,
-                       help='original size of vocab, if specified will trim padding from embedding table.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file. If specified will use this to get vocab size and '
-                       'trim padding from the embedding table.')
-    group.add_argument('--megatron-path', type=str, default=None,
-                       help='Base directory of Megatron repository')
-    group.add_argument('--position-embedding-type',
-                       type=str,
-                       default='learned_absolute',
-                       choices=['learned_absolute', 'rope'],
-                       help='Position embedding type.')
-    group.add_argument('--loader-transformer-impl', default='local',
-                       choices=['local', 'transformer_engine'],
-                       help='Which Transformer implementation to use.')
+    group.add_argument(
+        '--true-vocab-size',
+        type=int,
+        default=None,
+        help='original size of vocab, if specified will trim padding from embedding table.',
+    )
+    group.add_argument(
+        '--vocab-file',
+        type=str,
+        default=None,
+        help='Path to the vocab file. If specified will use this to get vocab size and '
+        'trim padding from the embedding table.',
+    )
+    group.add_argument(
+        '--megatron-path', type=str, default=None, help='Base directory of Megatron repository'
+    )
+    group.add_argument(
+        '--position-embedding-type',
+        type=str,
+        default='learned_absolute',
+        choices=['learned_absolute', 'rope'],
+        help='Position embedding type.',
+    )
+    group.add_argument(
+        '--loader-transformer-impl',
+        default='local',
+        choices=['local', 'transformer_engine'],
+        help='Which Transformer implementation to use.',
+    )
+
 
 def _load_checkpoint(queue, args):
 
     # Search in directory above this
-    sys.path.append(os.path.abspath(
-        os.path.join(os.path.dirname(__file__),
-                     os.path.pardir)))
+    sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
     if args.megatron_path is not None:
         sys.path.insert(0, args.megatron_path)
 
     try:
-        from megatron.training.arguments import parse_args, validate_args
-        from megatron.training.global_vars import set_args, set_global_variables
-        from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint
-        from megatron.legacy.model import module
         from megatron.core import mpu
         from megatron.core.enums import ModelType
+        from megatron.legacy.model import module
+        from megatron.training.arguments import parse_args, validate_args
+        from megatron.training.checkpointing import load_args_from_checkpoint, load_checkpoint
+        from megatron.training.global_vars import set_args, set_global_variables
     except ModuleNotFoundError:
-        print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.")
+        print(
+            "Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting."
+        )
         queue.put("exit")
         exit(1)
 
     # We want all arguments to come from us
-    sys.argv = ['script.py',
-                '--no-masked-softmax-fusion',
-                '--no-bias-gelu-fusion',
-                '--no-bias-dropout-fusion',
-                '--use-cpu-initialization',
-                '--micro-batch-size', '1',
-                '--no-load-optim',
-                '--no-load-rng',
-                '--no-save-optim',
-                '--no-save-rng',
-                '--mock-data', # To pass the "blend data checks" in arguments.py
-                '--no-initialization',
-                '--load', args.load_dir,
-                '--position-embedding-type', args.position_embedding_type,
-                '--exit-on-missing-checkpoint',
-                '--use-mp-args-from-checkpoint-args',
-                '--no-one-logger',
-                ]
+    sys.argv = [
+        'script.py',
+        '--no-masked-softmax-fusion',
+        '--no-bias-gelu-fusion',
+        '--no-bias-dropout-fusion',
+        '--use-cpu-initialization',
+        '--micro-batch-size',
+        '1',
+        '--no-load-optim',
+        '--no-load-rng',
+        '--no-save-optim',
+        '--no-save-rng',
+        '--mock-data',  # To pass the "blend data checks" in arguments.py
+        '--no-initialization',
+        '--load',
+        args.load_dir,
+        '--position-embedding-type',
+        args.position_embedding_type,
+        '--exit-on-missing-checkpoint',
+        '--use-mp-args-from-checkpoint-args',
+        '--no-one-logger',
+    ]
 
     margs = parse_args()
     margs, checkpoint_args = load_args_from_checkpoint(margs)
@@ -116,12 +134,14 @@ def check_for_arg(arg_name, default=None):
 
     # Determine how to make our models
     if args.model_type == 'GPT':
-        from model_provider import model_provider as common_model_provider
         from gpt_builders import gpt_builder
+        from model_provider import model_provider as common_model_provider
+
         model_provider = partial(common_model_provider, gpt_builder)
         margs.model_type = ModelType.encoder_or_decoder
     elif args.model_type == 'BERT':
         from pretrain_bert import model_provider
+
         margs.model_type = ModelType.encoder_or_decoder
     else:
         raise Exception(f'unrecognized model type: {args.model_type}')
@@ -131,6 +151,7 @@ def check_for_arg(arg_name, default=None):
 
     consumed_train_samples = None
     consumed_valid_samples = None
+
     def get_models(count, dtype):
         nonlocal consumed_train_samples
         nonlocal consumed_valid_samples
@@ -152,8 +173,7 @@ def get_models(count, dtype):
                     pre_process = mpu.is_pipeline_first_stage()
                     post_process = mpu.is_pipeline_last_stage()
                     this_model = model_provider(
-                        pre_process=pre_process,
-                        post_process=post_process
+                        pre_process=pre_process, post_process=post_process
                     ).to(dtype)
                     model_.append(this_model)
             else:
@@ -167,11 +187,11 @@ def get_models(count, dtype):
             load_checkpoint(model_, None, None)
 
             if consumed_train_samples is not None:
-                assert(margs.consumed_train_samples == consumed_train_samples)
+                assert margs.consumed_train_samples == consumed_train_samples
             else:
                 consumed_train_samples = margs.consumed_train_samples
             if consumed_valid_samples is not None:
-                assert(margs.consumed_valid_samples == consumed_valid_samples)
+                assert margs.consumed_valid_samples == consumed_valid_samples
             else:
                 consumed_valid_samples = margs.consumed_valid_samples
             for vp_rank in range(model_array_len):
@@ -182,7 +202,7 @@ def get_models(count, dtype):
     mpu.set_tensor_model_parallel_world_size(margs.tensor_model_parallel_size)
     mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size)
     mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size)
-    
+
     # For backward compatibility during local parallel states refactoring
     fake_tp_group = _ConverterFakeProcessGroup(size=margs.tensor_model_parallel_size)
     mpu._TENSOR_MODEL_PARALLEL_GROUP = fake_tp_group
@@ -194,7 +214,9 @@ def get_models(count, dtype):
         vocab = json.load(open(args.vocab_file))
         true_vocab_size = len(vocab)
         if args.true_vocab_size is not None and true_vocab_size != args.true_vocab_size:
-            print("Both --true-vocab-size and --vocab-file specified and the vocab size does not match, aborting.")
+            print(
+                "Both --true-vocab-size and --vocab-file specified and the vocab size does not match, aborting."
+            )
             queue.put("exit")
             exit(1)
     else:
@@ -255,11 +277,17 @@ def queue_put(name, msg):
     # Send embeddings
     message = {
         "word embeddings": torch.cat(
-            [models[tp_rank].language_model.embedding.word_embeddings.weight.data for tp_rank in range(tp_size)],
-            dim = 0)
+            [
+                models[tp_rank].language_model.embedding.word_embeddings.weight.data
+                for tp_rank in range(tp_size)
+            ],
+            dim=0,
+        )
     }
     if md.position_embedding_type == 'learned_absolute':
-        message["position embeddings"] = models[0].language_model.embedding.position_embeddings.weight.data
+        message["position embeddings"] = models[
+            0
+        ].language_model.embedding.position_embeddings.weight.data
     else:
         assert not hasattr(models[0].language_model.embedding, 'position_embeddings')
 
@@ -327,8 +355,8 @@ def queue_put(name, msg):
                     if md.swiglu:
                         for tp_rank in range(tp_size):
                             mlp_l0_bias[tp_rank] = torch.chunk(mlp_l0_bias[tp_rank], 2, dim=0)
-                        message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias],dim=0)
-                        message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias],dim=0)
+                        message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias], dim=0)
+                        message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias], dim=0)
                     else:
                         message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0)
 
@@ -337,9 +365,7 @@ def queue_put(name, msg):
                 total_layer_num = total_layer_num + 1
 
     # Send final norm from tp_rank 0
-    message = {
-        "weight": models[0].language_model.encoder.final_norm.weight.data,
-    }
+    message = {"weight": models[0].language_model.encoder.final_norm.weight.data}
     if norm_has_bias:
         message["bias"] = models[0].language_model.encoder.final_norm.bias.data
     queue_put("final norm", message)
@@ -347,17 +373,20 @@ def queue_put(name, msg):
     if md.output_layer:
         message = {
             "weight": torch.cat(
-                [models[tp_rank].language_model.output_layer.weight.data for tp_rank in range(tp_size)],
-                dim = 0)
+                [
+                    models[tp_rank].language_model.output_layer.weight.data
+                    for tp_rank in range(tp_size)
+                ],
+                dim=0,
+            )
         }
         queue_put("output layer", message)
 
-
     # Send BERT lm head and binary head if it exists
     if md.model_type == 'BERT':
         message = {
             "weight": models[0].language_model.pooler.dense.weight.data,
-            "bias": models[0].language_model.pooler.dense.bias.data
+            "bias": models[0].language_model.pooler.dense.bias.data,
         }
         queue_put("pooler", message)
 
@@ -373,11 +402,12 @@ def queue_put(name, msg):
         if md.bert_binary_head:
             message = {
                 "weight": models[0].binary_head.weight.data,
-                "bias": models[0].binary_head.bias.data
+                "bias": models[0].binary_head.bias.data,
             }
             queue_put("binary head", message)
     queue.put("done")
 
+
 def load_checkpoint(queue, args):
     try:
         _load_checkpoint(queue, args)
diff --git a/tools/checkpoint/loader_llama_mistral.py b/tools/checkpoint/loader_llama_mistral.py
index 994abd056bf..45285bbf29c 100644
--- a/tools/checkpoint/loader_llama_mistral.py
+++ b/tools/checkpoint/loader_llama_mistral.py
@@ -3,44 +3,81 @@
 import json
 import os
 import sys
-import torch
 
+import torch
 from utils import _ConverterFakeProcessGroup
+
 try:
     import transformers
 except ImportError:
     raise ImportError("The 'transformers' package is not installed.")
 import gc
 import shutil
-from tqdm import tqdm
 import types
 
+from tqdm import tqdm
+
 
 def add_arguments(parser):
     group = parser.add_argument_group(title='Llama/Mistral loader.')
 
     # TODO(jbarker): Need assertion to make sure *exactly* one of these is used
-    parser.add_argument('--model-size', type=str, required=True,
-                        choices=['llama2-7B', 'llama2-13B', 'llama2-70B', 'llama2-7Bf', 'llama2-13Bf', 'llama2-70Bf', 'llama3', 'mistral', 'yi-34B', 'qwen2.5'],
-                        help='Select model size/type')
-    parser.add_argument('--checkpoint-type', type=str, required=True,
-                        choices=['meta', 'hf'],
-                        help='Type of checkpoint to convert, options are "meta" or "hf"')
+    parser.add_argument(
+        '--model-size',
+        type=str,
+        required=True,
+        choices=[
+            'llama2-7B',
+            'llama2-13B',
+            'llama2-70B',
+            'llama2-7Bf',
+            'llama2-13Bf',
+            'llama2-70Bf',
+            'llama3',
+            'mistral',
+            'yi-34B',
+            'qwen2.5',
+        ],
+        help='Select model size/type',
+    )
+    parser.add_argument(
+        '--checkpoint-type',
+        type=str,
+        required=True,
+        choices=['meta', 'hf'],
+        help='Type of checkpoint to convert, options are "meta" or "hf"',
+    )
     parser.add_argument('--bf16', action='store_true', help='Whether to load weights in bf16.')
     parser.add_argument('--fp16', action='store_true', help='Whether to load weights in fp16.')
-    group.add_argument('--true-vocab-size', type=int, default=None,
-                       help='original size of vocab, if specified will trim padding from embedding table.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file. If specified will use this to get vocab size and '
-                       'trim padding from the embedding table.')
-    group.add_argument('--tokenizer-model', required=True,
-                       help='Tokenizer model file.')
-    group.add_argument('--megatron-path', type=str, default=None,
-                       help='Base directory of Megatron repository')
-    group.add_argument("--make-vocab-size-divisible-by", type=int, default=None, help="Make vocab size divisible by")
-    group.add_argument('--loader-transformer-impl', default='local',
-                       choices=['local', 'transformer_engine'],
-                       help='Which Transformer implementation to use.')
+    group.add_argument(
+        '--true-vocab-size',
+        type=int,
+        default=None,
+        help='original size of vocab, if specified will trim padding from embedding table.',
+    )
+    group.add_argument(
+        '--vocab-file',
+        type=str,
+        default=None,
+        help='Path to the vocab file. If specified will use this to get vocab size and '
+        'trim padding from the embedding table.',
+    )
+    group.add_argument('--tokenizer-model', required=True, help='Tokenizer model file.')
+    group.add_argument(
+        '--megatron-path', type=str, default=None, help='Base directory of Megatron repository'
+    )
+    group.add_argument(
+        "--make-vocab-size-divisible-by",
+        type=int,
+        default=None,
+        help="Make vocab size divisible by",
+    )
+    group.add_argument(
+        '--loader-transformer-impl',
+        default='local',
+        choices=['local', 'transformer_engine'],
+        help='Which Transformer implementation to use.',
+    )
 
 
 def verify_transformers_version():
@@ -59,7 +96,9 @@ def verify_transformers_version():
 
 
 def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
-    return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
+    return multiple_of * (
+        (int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of
+    )
 
 
 def read_json(path):
@@ -77,9 +116,11 @@ def write_json(text, path):
 def convert_to_hf(model_path, input_base_path, model_size, tokenizer_path):
     if "llama2" in model_size:
         from transformers import LlamaConfig as ModelConfig
-        from transformers import  LlamaTokenizer, LlamaTokenizerFast
+        from transformers import LlamaTokenizer, LlamaTokenizerFast
     else:
-        raise NotImplementedError(f"converting {model_size} is only supported using HuggingFace weights")
+        raise NotImplementedError(
+            f"converting {model_size} is only supported using HuggingFace weights"
+        )
 
     # for backward compatibility, before you needed the repo to be called `my_repo/model_size`
     if not os.path.isfile(os.path.join(input_base_path, "params.json")):
@@ -133,11 +174,15 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
     if num_shards == 1:
         # Not sharded
         # (The sharded implementation would also work, but this is simpler.)
-        loaded = torch.load(os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu")
+        loaded = torch.load(
+            os.path.join(input_base_path, "consolidated.00.pth"), map_location="cpu"
+        )
     else:
         # Sharded
         loaded = [
-            torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu")
+            torch.load(
+                os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu"
+            )
             for i in range(num_shards)
         ]
     param_count = 0
@@ -154,13 +199,27 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
             state_dict = {
                 f"model.layers.{layer_i}.self_attn.q_proj.weight": q_proj,
                 f"model.layers.{layer_i}.self_attn.k_proj.weight": k_proj,
-                f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"],
-                f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"],
-                f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"],
-                f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"],
-                f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"],
-                f"model.layers.{layer_i}.input_layernorm.weight": loaded[f"layers.{layer_i}.attention_norm.weight"],
-                f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[f"layers.{layer_i}.ffn_norm.weight"],
+                f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[
+                    f"layers.{layer_i}.attention.wv.weight"
+                ],
+                f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[
+                    f"layers.{layer_i}.attention.wo.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[
+                    f"layers.{layer_i}.feed_forward.w1.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[
+                    f"layers.{layer_i}.feed_forward.w2.weight"
+                ],
+                f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[
+                    f"layers.{layer_i}.feed_forward.w3.weight"
+                ],
+                f"model.layers.{layer_i}.input_layernorm.weight": loaded[
+                    f"layers.{layer_i}.attention_norm.weight"
+                ],
+                f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[
+                    f"layers.{layer_i}.ffn_norm.weight"
+                ],
             }
         else:
             # Sharded
@@ -179,7 +238,9 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
             state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
                 torch.cat(
                     [
-                        loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim)
+                        loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(
+                            n_heads_per_shard, dims_per_head, dim
+                        )
                         for i in range(num_shards)
                     ],
                     dim=0,
@@ -210,16 +271,20 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
             ).reshape(key_value_dim, dim)
 
             state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat(
-                [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1
+                [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)],
+                dim=1,
             )
             state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat(
-                [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0
+                [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)],
+                dim=0,
             )
             state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat(
-                [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1
+                [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)],
+                dim=1,
             )
             state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat(
-                [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0
+                [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)],
+                dim=0,
             )
 
         state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
@@ -243,7 +308,9 @@ def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
             "model.embed_tokens.weight": torch.cat(
                 [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=d
             ),
-            "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0),
+            "lm_head.weight": torch.cat(
+                [loaded[i]["output.weight"] for i in range(num_shards)], dim=0
+            ),
         }
 
     for k, v in state_dict.items():
@@ -297,7 +364,7 @@ def load_args_from_checkpoint(args, model_size):
     args.num_layers = model_args["num_hidden_layers"]
     args.global_batch_size = 1024
     args.norm_epsilon = model_args["rms_norm_eps"]
-    args.iteration = 1 # '0', 'release' don't work
+    args.iteration = 1  # '0', 'release' don't work
     args.position_embedding_type = "rope"
     args.swiglu = True
     args.normalization = "RMSNorm"
@@ -315,7 +382,8 @@ def load_args_from_checkpoint(args, model_size):
 def set_preprocess_state(args, model, hf_model):
     '''Set embedding params.'''
     model.language_model.embedding.word_embeddings.weight.data.copy_(
-        hf_model.model.embed_tokens.weight)
+        hf_model.model.embed_tokens.weight
+    )
 
 
 def set_postprocess_state(args, model, hf_model):
@@ -335,23 +403,32 @@ def set_attn_state(args, layer, hf_layer):
     # Reshape loaded weights.
     tp = args.tensor_model_parallel_size
     nh = args.num_attention_heads // tp
-    ng = (args.num_query_groups if args.group_query_attention \
-        else args.num_attention_heads) // tp
+    ng = (args.num_query_groups if args.group_query_attention else args.num_attention_heads) // tp
     dim = args.kv_channels
     assert nh % ng == 0
 
     # Copy weights (re-order dimensions for Megatron).
-    attn.query_key_value.weight.data.copy_(torch.cat([
-        hf_attn.q_proj.weight.reshape((ng, dim*nh//ng, -1)),
-        hf_attn.k_proj.weight.reshape((ng, dim, -1)),
-        hf_attn.v_proj.weight.reshape((ng, dim, -1)),
-    ], dim=1).reshape((-1, args.hidden_size)))
+    attn.query_key_value.weight.data.copy_(
+        torch.cat(
+            [
+                hf_attn.q_proj.weight.reshape((ng, dim * nh // ng, -1)),
+                hf_attn.k_proj.weight.reshape((ng, dim, -1)),
+                hf_attn.v_proj.weight.reshape((ng, dim, -1)),
+            ],
+            dim=1,
+        ).reshape((-1, args.hidden_size))
+    )
     if args.add_qkv_bias:
-        attn.query_key_value.bias.data.copy_(torch.cat([
-            hf_attn.q_proj.bias.reshape((ng, dim*nh//ng)),
-            hf_attn.k_proj.bias.reshape((ng, dim)),
-            hf_attn.v_proj.bias.reshape((ng, dim)),
-        ], dim=1).reshape(-1))
+        attn.query_key_value.bias.data.copy_(
+            torch.cat(
+                [
+                    hf_attn.q_proj.bias.reshape((ng, dim * nh // ng)),
+                    hf_attn.k_proj.bias.reshape((ng, dim)),
+                    hf_attn.v_proj.bias.reshape((ng, dim)),
+                ],
+                dim=1,
+            ).reshape(-1)
+        )
 
     attn.dense.weight.data.copy_(hf_attn.o_proj.weight)
 
@@ -362,10 +439,9 @@ def set_mlp_state(args, layer, hf_layer):
     mlp = layer.mlp
     hf_mlp = hf_layer.mlp
 
-    mlp.dense_h_to_4h.weight.data.copy_(torch.cat([
-        hf_mlp.gate_proj.weight,
-        hf_mlp.up_proj.weight,
-    ], dim=0))
+    mlp.dense_h_to_4h.weight.data.copy_(
+        torch.cat([hf_mlp.gate_proj.weight, hf_mlp.up_proj.weight], dim=0)
+    )
     mlp.dense_4h_to_h.weight.data.copy_(hf_mlp.down_proj.weight)
 
 
@@ -384,12 +460,15 @@ def set_layer_state(args, model, hf_model, layer_idx):
 def load_checkpoint_to_model(args):
     '''Set model params.'''
 
-    from model_provider import model_provider
-    from gpt_builders import gpt_builder
     from transformers import AutoModelForCausalLM
 
+    from gpt_builders import gpt_builder
+    from model_provider import model_provider
+
     # Load Huggingface model.
-    hf_model = AutoModelForCausalLM.from_pretrained(args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu")
+    hf_model = AutoModelForCausalLM.from_pretrained(
+        args.load, torch_dtype=args.params_dtype, low_cpu_mem_usage=True, device_map="cpu"
+    )
 
     # Init Megatron model.
     model = model_provider(gpt_builder, pre_process=True, post_process=True).to(args.params_dtype)
@@ -408,46 +487,55 @@ def _load_checkpoint(queue, args):
     verify_transformers_version()
 
     # Search in directory above this.
-    sys.path.append(os.path.abspath(
-        os.path.join(os.path.dirname(__file__),
-                     os.path.pardir,
-                     os.path.pardir)))
+    sys.path.append(
+        os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+    )
     if args.megatron_path is not None:
         sys.path.insert(0, args.megatron_path)
 
     # Convert Meta checkpoint to HF format as an intermediate step
     if args.checkpoint_type == "meta":
-        model_tmp_path = convert_to_hf(model_path=os.path.join(args.save_dir, 'tmp'), input_base_path=args.load_dir, model_size=args.model_size, tokenizer_path=args.tokenizer_model)
+        model_tmp_path = convert_to_hf(
+            model_path=os.path.join(args.save_dir, 'tmp'),
+            input_base_path=args.load_dir,
+            model_size=args.model_size,
+            tokenizer_path=args.tokenizer_model,
+        )
         args.load_dir = model_tmp_path
-        args.tokenizer_model = model_tmp_path # point to HF tokenizer model
+        args.tokenizer_model = model_tmp_path  # point to HF tokenizer model
 
     try:
-        from megatron.training.arguments import parse_args, validate_args
-        from megatron.training.global_vars import set_args, set_global_variables
-        from megatron.legacy.model import module
         from megatron.core import mpu
         from megatron.core.enums import ModelType
+        from megatron.legacy.model import module
+        from megatron.training.arguments import parse_args, validate_args
+        from megatron.training.global_vars import set_args, set_global_variables
     except ModuleNotFoundError:
-        print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.")
+        print(
+            "Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting."
+        )
         queue.put("exit")
         exit(1)
 
     # We want all arguments to come from us.
-    sys.argv = ['script.py',
-                '--no-masked-softmax-fusion',
-                '--no-bias-gelu-fusion',
-                '--no-bias-dropout-fusion',
-                '--use-cpu-initialization',
-                '--micro-batch-size', '1',
-                '--no-load-optim',
-                '--no-load-rng',
-                '--no-save-optim',
-                '--no-save-rng',
-                '--mock-data', # To pass the "blend data checks" in arguments.py
-                '--no-initialization',
-                '--load', args.load_dir,
-                '--no-one-logger',
-                ]
+    sys.argv = [
+        'script.py',
+        '--no-masked-softmax-fusion',
+        '--no-bias-gelu-fusion',
+        '--no-bias-dropout-fusion',
+        '--use-cpu-initialization',
+        '--micro-batch-size',
+        '1',
+        '--no-load-optim',
+        '--no-load-rng',
+        '--no-save-optim',
+        '--no-save-rng',
+        '--mock-data',  # To pass the "blend data checks" in arguments.py
+        '--no-initialization',
+        '--load',
+        args.load_dir,
+        '--no-one-logger',
+    ]
 
     if args.make_vocab_size_divisible_by is not None:
         sys.argv.extend(["--make-vocab-size-divisible-by", str(args.make_vocab_size_divisible_by)])
@@ -506,7 +594,9 @@ def check_for_arg(arg_name, default=None):
     # Determine how to make our models.
     assert args.model_type == 'GPT', 'Llama-2, Llama-3 and Mistral are GPT models.'
     margs.model_type = ModelType.encoder_or_decoder
-    margs.params_dtype = torch.bfloat16 if args.bf16 else torch.float16 if args.fp16 else torch.float32
+    margs.params_dtype = (
+        torch.bfloat16 if args.bf16 else torch.float16 if args.fp16 else torch.float32
+    )
 
     # Suppress warning about torch.distributed not being initialized.
     module.MegatronModule.embedding_warning_printed = True
@@ -573,11 +663,11 @@ def queue_put(name, msg):
         queue.put(msg)
 
     # Send embeddings.
-    message = {
-        "word embeddings": model.language_model.embedding.word_embeddings.weight.data
-    }
+    message = {"word embeddings": model.language_model.embedding.word_embeddings.weight.data}
     if md.position_embedding_type == 'learned_absolute':
-        message["position embeddings"] = model.language_model.embedding.position_embeddings.weight.data
+        message["position embeddings"] = (
+            model.language_model.embedding.position_embeddings.weight.data
+        )
     else:
         assert not hasattr(model.language_model.embedding, 'position_embeddings')
 
@@ -632,23 +722,19 @@ def queue_put(name, msg):
             if md.swiglu:
                 for tp_rank in range(tp_size):
                     mlp_l0_bias[tp_rank] = torch.chunk(mlp_l0_bias[tp_rank], 2, dim=0)
-                message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias],dim=0)
-                message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias],dim=0)
+                message["mlp l0 bias W"] = torch.cat([b[0] for b in mlp_l0_bias], dim=0)
+                message["mlp l0 bias V"] = torch.cat([b[1] for b in mlp_l0_bias], dim=0)
             else:
                 message["mlp l0 bias"] = torch.cat(mlp_l0_bias, dim=0)
 
         queue_put(f"transformer layer {layer_num}", message)
 
     # Send final norm from tp_rank 0.
-    message = {
-        "weight": model.language_model.encoder.final_norm.weight.data,
-    }
+    message = {"weight": model.language_model.encoder.final_norm.weight.data}
     queue_put("final norm", message)
 
     if md.output_layer:
-        message = {
-            "weight": model.language_model.output_layer.weight.data
-        }
+        message = {"weight": model.language_model.output_layer.weight.data}
         queue_put("output layer", message)
 
     queue.put("done")
diff --git a/tools/checkpoint/loader_mixtral_hf.py b/tools/checkpoint/loader_mixtral_hf.py
index f8eb1d5e646..8c57a9737c8 100644
--- a/tools/checkpoint/loader_mixtral_hf.py
+++ b/tools/checkpoint/loader_mixtral_hf.py
@@ -3,10 +3,11 @@
 import json
 import os
 import sys
+import types
+
 import torch
 import transformers
 from tqdm import tqdm
-import types
 
 from tools.checkpoint.utils import _ConverterFakeProcessGroup
 
@@ -14,27 +15,36 @@
 def add_arguments(parser):
     group = parser.add_argument_group(title='Mixtral HF loader.')
 
-    group.add_argument('--true-vocab-size', type=int, default=None,
-                       help='original size of vocab, if specified will trim padding from embedding table.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file. If specified will use this to get vocab size and '
-                       'trim padding from the embedding table.')
-    group.add_argument('--tokenizer-model', required=True,
-                       help='Sentencepiece tokenizer model.')
-    group.add_argument('--megatron-path', type=str, default=None,
-                       help='Base directory of deepspeed repository')
+    group.add_argument(
+        '--true-vocab-size',
+        type=int,
+        default=None,
+        help='original size of vocab, if specified will trim padding from embedding table.',
+    )
+    group.add_argument(
+        '--vocab-file',
+        type=str,
+        default=None,
+        help='Path to the vocab file. If specified will use this to get vocab size and '
+        'trim padding from the embedding table.',
+    )
+    group.add_argument('--tokenizer-model', required=True, help='Sentencepiece tokenizer model.')
+    group.add_argument(
+        '--megatron-path', type=str, default=None, help='Base directory of deepspeed repository'
+    )
 
 
 def load_args_from_checkpoint(args):
     # Read Mixtral 8x7B args.
     from transformers import MixtralConfig
+
     mixtral_config = MixtralConfig.from_pretrained(args.load)
 
     # Update Megatron args.
     args.untie_embeddings_and_output_weights = True
     args.seq_length = 4096
     args.global_batch_size = 1024
-    args.iteration = 1 # '0', 'release' don't work
+    args.iteration = 1  # '0', 'release' don't work
     args.add_position_embedding = False
     args.use_rotary_position_embeddings = True
     args.swiglu = True
@@ -60,20 +70,23 @@ def load_args_from_checkpoint(args):
         args.group_query_attention = True
         args.num_query_groups = mixtral_config.num_key_value_heads
 
+
 def verify_transformers_version():
     major, minor, patch = map(int, transformers.__version__.split('.'))
     assert major >= 4 and minor >= 36
 
+
 def set_preprocess_state(args, model, hf_model):
     '''Set embedding params.'''
-    model.embedding.word_embeddings.weight.data.copy_(
-        hf_model.model.embed_tokens.weight)
+    model.embedding.word_embeddings.weight.data.copy_(hf_model.model.embed_tokens.weight)
+
 
 def set_postprocess_state(args, model, hf_model):
     '''Set output layer & norm params.'''
     model.decoder.final_layernorm.weight.data.copy_(hf_model.model.norm.weight)
     model.output_layer.weight.data.copy_(hf_model.lm_head.weight)
 
+
 def set_attn_state(args, layer, hf_layer):
     '''Set self-attention params.'''
 
@@ -84,19 +97,27 @@ def set_attn_state(args, layer, hf_layer):
     # Reshape loaded weights.
     tp = args.tensor_model_parallel_size
     num_heads = args.num_attention_heads // tp
-    num_query_groups = (args.num_query_groups if args.group_query_attention else args.num_attention_heads) // tp
+    num_query_groups = (
+        args.num_query_groups if args.group_query_attention else args.num_attention_heads
+    ) // tp
     num_querys_per_group = num_heads // num_query_groups
     dim = args.kv_channels
     assert num_heads % num_querys_per_group == 0
 
     # Copy weights (re-order dimensions for Megatron).
-    attn.linear_qkv.weight.data.copy_(torch.cat([
-        hf_attn.q_proj.weight.reshape((num_query_groups, num_querys_per_group*dim, -1)),
-        hf_attn.k_proj.weight.reshape((num_query_groups, dim, -1)),
-        hf_attn.v_proj.weight.reshape((num_query_groups, dim, -1)),
-    ], dim=1).reshape((-1, args.hidden_size)))
+    attn.linear_qkv.weight.data.copy_(
+        torch.cat(
+            [
+                hf_attn.q_proj.weight.reshape((num_query_groups, num_querys_per_group * dim, -1)),
+                hf_attn.k_proj.weight.reshape((num_query_groups, dim, -1)),
+                hf_attn.v_proj.weight.reshape((num_query_groups, dim, -1)),
+            ],
+            dim=1,
+        ).reshape((-1, args.hidden_size))
+    )
     attn.linear_proj.weight.data.copy_(hf_attn.o_proj.weight)
 
+
 def set_mlp_state(args, layer, hf_layer):
     '''Set MLP params.'''
 
@@ -106,14 +127,10 @@ def set_mlp_state(args, layer, hf_layer):
     hf_experts = hf_layer.block_sparse_moe.experts
     for expert_idx in range(args.num_experts):
         mcore_experts[expert_idx].linear_fc1.weight.data.copy_(
-            torch.cat([
-                hf_experts[expert_idx].w1.weight,
-                hf_experts[expert_idx].w3.weight
-            ], dim=0)
-        )
-        mcore_experts[expert_idx].linear_fc2.weight.data.copy_(
-            hf_experts[expert_idx].w2.weight
+            torch.cat([hf_experts[expert_idx].w1.weight, hf_experts[expert_idx].w3.weight], dim=0)
         )
+        mcore_experts[expert_idx].linear_fc2.weight.data.copy_(hf_experts[expert_idx].w2.weight)
+
 
 def set_layer_state(args, model, hf_model, layer_idx):
     '''Set transformer layer params.'''
@@ -127,12 +144,14 @@ def set_layer_state(args, model, hf_model, layer_idx):
     layer.self_attention.linear_qkv.layer_norm_weight.data.copy_(hf_layer.input_layernorm.weight)
     layer.pre_mlp_layernorm.weight.data.copy_(hf_layer.post_attention_layernorm.weight)
 
+
 def load_checkpoint_to_model(args):
     '''Set model params.'''
 
-    from model_provider import model_provider
+    from transformers import MixtralConfig, MixtralForCausalLM
+
     from gpt_builders import gpt_builder
-    from transformers import MixtralForCausalLM, MixtralConfig
+    from model_provider import model_provider
 
     # Load Huggingface model.
 
@@ -155,43 +174,48 @@ def _load_checkpoint(queue, args):
     verify_transformers_version()
 
     # Search in directory above this.
-    sys.path.append(os.path.abspath(
-        os.path.join(os.path.dirname(__file__),
-                     os.path.pardir,
-                     os.path.pardir)))
+    sys.path.append(
+        os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+    )
     if args.megatron_path is not None:
         sys.path.insert(0, args.megatron_path)
 
     try:
-        from megatron.training.arguments import parse_args, validate_args
-        from megatron.training.global_vars import set_args, set_global_variables
-        from megatron.legacy.model import module
         from megatron.core import mpu
         from megatron.core.enums import ModelType
+        from megatron.legacy.model import module
+        from megatron.training.arguments import parse_args, validate_args
+        from megatron.training.global_vars import set_args, set_global_variables
     except ModuleNotFoundError:
-        print("Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting.")
+        print(
+            "Unable to import Megatron, please specify the path to Megatron using --megatron-path. Exiting."
+        )
         queue.put("exit")
         exit(1)
 
     # We want all arguments to come from us.
-    sys.argv = ['script.py',
-                '--use-mcore-models',
-                '--disable-bias-linear',
-                '--no-masked-softmax-fusion',
-                '--no-bias-gelu-fusion',
-                '--no-bias-dropout-fusion',
-                '--use-cpu-initialization',
-                '--micro-batch-size', '1',
-                '--no-load-optim',
-                '--no-load-rng',
-                '--no-save-optim',
-                '--no-save-rng',
-                '--no-initialization',
-                '--mock-data', # To pass the "blend data checks" in arguments.py
-                '--transformer-impl', 'transformer_engine',
-                '--load', args.load_dir,
-                '--no-one-logger',
-                ]
+    sys.argv = [
+        'script.py',
+        '--use-mcore-models',
+        '--disable-bias-linear',
+        '--no-masked-softmax-fusion',
+        '--no-bias-gelu-fusion',
+        '--no-bias-dropout-fusion',
+        '--use-cpu-initialization',
+        '--micro-batch-size',
+        '1',
+        '--no-load-optim',
+        '--no-load-rng',
+        '--no-save-optim',
+        '--no-save-rng',
+        '--no-initialization',
+        '--mock-data',  # To pass the "blend data checks" in arguments.py
+        '--transformer-impl',
+        'transformer_engine',
+        '--load',
+        args.load_dir,
+        '--no-one-logger',
+    ]
 
     margs = parse_args()
     margs.tokenizer_model = args.tokenizer_model
@@ -239,7 +263,7 @@ def check_for_arg(arg_name, default=None):
     mpu.set_pipeline_model_parallel_world_size(margs.pipeline_model_parallel_size)
     mpu.set_virtual_pipeline_model_parallel_world_size(margs.virtual_pipeline_model_parallel_size)
     mpu.set_expert_model_parallel_world_size(margs.expert_model_parallel_size)
-    
+
     # For backward compatibility during local parallel states refactoring
     fake_tp_group = _ConverterFakeProcessGroup(size=margs.tensor_model_parallel_size)
     fake_ep_group = _ConverterFakeProcessGroup(size=margs.expert_model_parallel_size)
@@ -265,7 +289,7 @@ def check_for_arg(arg_name, default=None):
     md.swiglu = margs.swiglu
     md.previous_tensor_parallel_size = margs.tensor_model_parallel_size
     md.previous_pipeline_parallel_size = margs.pipeline_model_parallel_size
-    md.true_vocab_size = margs.vocab_size # skips padding in saver
+    md.true_vocab_size = margs.vocab_size  # skips padding in saver
     md.make_vocab_size_divisible_by = None
     md.checkpoint_args = margs
     md.consumed_train_samples = 0
@@ -286,9 +310,7 @@ def queue_put(name, msg):
         queue.put(msg)
 
     # Send embeddings.
-    message = {
-        "word embeddings": model.embedding.word_embeddings.weight.data
-    }
+    message = {"word embeddings": model.embedding.word_embeddings.weight.data}
     if md.position_embedding_type == 'learned_absolute':
         message["position embeddings"] = model.embedding.position_embeddings.weight.data
     else:
@@ -314,26 +336,34 @@ def queue_put(name, msg):
 
         message["router weight"] = layer.mlp.router.weight.data
         if md.swiglu:
-            chunked_mlp_l0_weight =  [torch.chunk(local_expert.linear_fc1.weight.data, 2, dim=0) for local_expert in experts]
-            message["mlp l0 weight W"] = torch.stack([local_weight[0] for local_weight in chunked_mlp_l0_weight], dim=0)
-            message["mlp l0 weight V"] = torch.stack([local_weight[1] for local_weight in chunked_mlp_l0_weight], dim=0)
+            chunked_mlp_l0_weight = [
+                torch.chunk(local_expert.linear_fc1.weight.data, 2, dim=0)
+                for local_expert in experts
+            ]
+            message["mlp l0 weight W"] = torch.stack(
+                [local_weight[0] for local_weight in chunked_mlp_l0_weight], dim=0
+            )
+            message["mlp l0 weight V"] = torch.stack(
+                [local_weight[1] for local_weight in chunked_mlp_l0_weight], dim=0
+            )
         else:
-            message["mlp l0 weight"] = torch.stack([local_expert.linear_fc1.weight.data for local_expert in experts])
-        message["mlp l1 weight"] = torch.stack([local_expert.linear_fc2.weight.data for local_expert in experts], dim=0)
+            message["mlp l0 weight"] = torch.stack(
+                [local_expert.linear_fc1.weight.data for local_expert in experts]
+            )
+        message["mlp l1 weight"] = torch.stack(
+            [local_expert.linear_fc2.weight.data for local_expert in experts], dim=0
+        )
 
         queue_put(f"transformer layer {layer_idx}", message)
 
-    queue_put("final norm", {
-        "weight": model.decoder.final_layernorm.weight.data,
-    })
+    queue_put("final norm", {"weight": model.decoder.final_layernorm.weight.data})
 
     if md.output_layer:
-        queue_put("output layer", {
-            "weight": model.output_layer.weight.data
-        })
+        queue_put("output layer", {"weight": model.output_layer.weight.data})
 
     queue.put("done")
 
+
 def load_checkpoint(queue, args):
     try:
         _load_checkpoint(queue, args)
diff --git a/tools/checkpoint/remap_gpt_dsa_to_mamba.py b/tools/checkpoint/remap_gpt_dsa_to_mamba.py
index 8a6888d1dc7..e5acbb09673 100644
--- a/tools/checkpoint/remap_gpt_dsa_to_mamba.py
+++ b/tools/checkpoint/remap_gpt_dsa_to_mamba.py
@@ -60,18 +60,20 @@ def _remap_key(key: str, num_gpt_layers: int) -> str:
 
     # Final layernorm name differs between TransformerBlock and MambaStack
     if key.startswith(final_ln_prefix):
-        return "decoder.final_norm." + key[len(final_ln_prefix):]
+        return "decoder.final_norm." + key[len(final_ln_prefix) :]
 
     if not key.startswith(layer_prefix):
         return key  # embedding, output_layer, rotary_pos_emb, etc.
 
     # Parse "decoder.layers.{N}.{rest}"
-    remainder = key[len(layer_prefix):]
+    remainder = key[len(layer_prefix) :]
     dot_idx = remainder.index('.')
     layer_n = int(remainder[:dot_idx])
-    rest = remainder[dot_idx + 1:]
+    rest = remainder[dot_idx + 1 :]
 
-    assert 0 <= layer_n < num_gpt_layers, f"Layer index {layer_n} out of range [0, {num_gpt_layers}) in key '{key}'"
+    assert (
+        0 <= layer_n < num_gpt_layers
+    ), f"Layer index {layer_n} out of range [0, {num_gpt_layers}) in key '{key}'"
 
     if rest.startswith("input_layernorm.") or rest.startswith("self_attention."):
         return f"{layer_prefix}{2 * layer_n}.{rest}"
@@ -88,9 +90,7 @@ def _remap_key(key: str, num_gpt_layers: int) -> str:
         )
 
 
-def _remap_state_dict(
-    gpt_sd: Dict, num_gpt_layers: int
-) -> Dict:
+def _remap_state_dict(gpt_sd: Dict, num_gpt_layers: int) -> Dict:
     """Apply key remapping to the full GPTModel state dict."""
     return {_remap_key(k, num_gpt_layers): v for k, v in gpt_sd.items()}
 
@@ -106,10 +106,7 @@ def convert(input_path: Path, output_path: Path, num_gpt_layers: int) -> None:
     try:
         import torch
         import torch.distributed.checkpoint as dcp
-        from torch.distributed.checkpoint.format_utils import (
-            dcp_to_torch_save,
-            torch_save_to_dcp,
-        )
+        from torch.distributed.checkpoint.format_utils import dcp_to_torch_save, torch_save_to_dcp
     except ImportError as exc:
         raise SystemExit(
             "PyTorch distributed checkpoint (torch.distributed.checkpoint) is required. "
@@ -129,9 +126,7 @@ def convert(input_path: Path, output_path: Path, num_gpt_layers: int) -> None:
 
         # --- Remap keys ---
         mamba_sd = _remap_state_dict(gpt_sd, num_gpt_layers)
-        print(
-            f"Remapped state dict: {len(gpt_sd)} GPT keys → {len(mamba_sd)} Mamba keys."
-        )
+        print(f"Remapped state dict: {len(gpt_sd)} GPT keys → {len(mamba_sd)} Mamba keys.")
 
         # --- Save remapped state dict as a new flat .pt then convert to DCP ---
         tmp_mamba = output_path.parent / "_tmp_mamba_flat.pt"
@@ -152,15 +147,21 @@ def main() -> None:
         description="Convert GPTModel DSA checkpoint to MambaModel-compatible format."
     )
     parser.add_argument(
-        "--input", required=True, type=Path,
+        "--input",
+        required=True,
+        type=Path,
         help="Path to the source GPTModel DCP checkpoint directory.",
     )
     parser.add_argument(
-        "--output", required=True, type=Path,
+        "--output",
+        required=True,
+        type=Path,
         help="Destination path for the MambaModel DCP checkpoint.",
     )
     parser.add_argument(
-        "--num-gpt-layers", required=True, type=int,
+        "--num-gpt-layers",
+        required=True,
+        type=int,
         help="Number of decoder layers in the GPTModel (e.g. 4).",
     )
     args = parser.parse_args()
diff --git a/tools/checkpoint/saver_base.py b/tools/checkpoint/saver_base.py
index 8f60d0515db..b67d75a287a 100644
--- a/tools/checkpoint/saver_base.py
+++ b/tools/checkpoint/saver_base.py
@@ -1,13 +1,14 @@
 # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
 import json
 import os
-from importlib.metadata import version
-from packaging.version import Version as PkgVersion
 import sys
-import torch
+from importlib.metadata import version
 
+import torch
+from packaging.version import Version as PkgVersion
 from utils import _ConverterFakeProcessGroup, chunk_bias, chunk_weight
 
+
 class MegatronCheckpointSaverBase:
     """Orchestrates saving a Megatron checkpoint using parameters received on a multiprocessing queue.
 
@@ -22,11 +23,13 @@ def __init__(self, args, queue, build_tokenizer=False):
         self.queue = queue
         self.build_tokenizer = build_tokenizer
 
-        self.margs = None            # Will hold Megatron's main args
-        self.md = None               # Metadata received from the loader
+        self.margs = None  # Will hold Megatron's main args
+        self.md = None  # Metadata received from the loader
 
         self.models = None
-        self.model_provider = None   # model_provider function either from pretrain_gpt or pretrain_bert
+        self.model_provider = (
+            None  # model_provider function either from pretrain_gpt or pretrain_bert
+        )
 
     def _maybe_parse_additional_megatron_args(self, margs):
         """
@@ -41,14 +44,14 @@ def insert_megatron_path_and_check_te(self):
         """
         # Transformer engine >= 0.12.0, for CPU initialization.
         te_version = PkgVersion(version("transformer-engine"))
-        assert te_version >= PkgVersion("0.12.0"), \
+        assert te_version >= PkgVersion("0.12.0"), (
             "transformer engine version: %s (>=0.12.0 required)." % te_version
+        )
 
         # Search in directory above this
-        sys.path.append(os.path.abspath(
-            os.path.join(os.path.dirname(__file__),
-                         os.path.pardir,
-                         os.path.pardir)))
+        sys.path.append(
+            os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
+        )
         if self.args.megatron_path is not None:
             sys.path.insert(0, self.args.megatron_path)
 
@@ -59,20 +62,41 @@ def _load_checkpoint_args(self, margs):
         if hasattr(self.md, 'checkpoint_args'):
             # These are arguments that we are either changing, or cause problems for validation if they are set
             # Note that some of these deal with T5 so will need to be changed if we support T5.
-            args_to_keep = ['tensor_model_parallel_size', 'pipeline_model_parallel_size', 'expert_model_parallel_size', 'world_size', 'params_dtype',
-                            'num_layers_per_virtual_pipeline_stage', 'virtual_pipeline_model_parallel_size',
-                            'masked_softmax_fusion', 'bias_gelu_fusion', 'bias_dropout_fusion',
-                            'sequence_parallel',
-                            'no_load_optim', 'no_load_rng', 'no_save_optim', 'no_save_rng',
-                            'vocab_file', 'tokenizer_model',
-                            'save_interval', 'save',
-                            'perform_initialization', 'use_cpu_initialization',
-                            'recompute_granularity', 'recompute_num_layers', 'recompute_method',
-                            'encoder_num_layers', 'encoder_seq_length',
-                            'distribute_saved_activations',
-                            'train_iters', 'lr_decay_iters', 'lr_warmup_iters', 'lr_warmup_fraction',
-                            'start_weight_decay', 'end_weight_decay',
-                            'ckpt_format',
+            args_to_keep = [
+                'tensor_model_parallel_size',
+                'pipeline_model_parallel_size',
+                'expert_model_parallel_size',
+                'world_size',
+                'params_dtype',
+                'num_layers_per_virtual_pipeline_stage',
+                'virtual_pipeline_model_parallel_size',
+                'masked_softmax_fusion',
+                'bias_gelu_fusion',
+                'bias_dropout_fusion',
+                'sequence_parallel',
+                'no_load_optim',
+                'no_load_rng',
+                'no_save_optim',
+                'no_save_rng',
+                'vocab_file',
+                'tokenizer_model',
+                'save_interval',
+                'save',
+                'perform_initialization',
+                'use_cpu_initialization',
+                'recompute_granularity',
+                'recompute_num_layers',
+                'recompute_method',
+                'encoder_num_layers',
+                'encoder_seq_length',
+                'distribute_saved_activations',
+                'train_iters',
+                'lr_decay_iters',
+                'lr_warmup_iters',
+                'lr_warmup_fraction',
+                'start_weight_decay',
+                'end_weight_decay',
+                'ckpt_format',
             ]
 
             for arg, value in vars(self.md.checkpoint_args).items():
@@ -82,7 +106,9 @@ def _load_checkpoint_args(self, margs):
                     print(f"Checkpoint had argument {arg} but new arguments does not have this.")
                     continue
                 if getattr(margs, arg) != value:
-                    print(f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}.")
+                    print(
+                        f"Overwriting default {arg} value {getattr(margs, arg)} with value from checkpoint {value}."
+                    )
                     setattr(margs, arg, value)
 
         return margs
@@ -138,8 +164,8 @@ def initialize_megatron_env(self):
         Initialize Megatron global variables and fused kernels.
         """
         try:
-            from megatron.training.global_vars import set_global_variables, get_args
             from megatron.core import mpu
+            from megatron.training.global_vars import get_args, set_global_variables
         except ModuleNotFoundError as e:
             print(f"Unable to import required Megatron modules: {e}")
             sys.exit(1)
@@ -152,8 +178,10 @@ def initialize_megatron_env(self):
         if hasattr(self.md, 'consumed_train_samples'):
             self.margs.consumed_train_samples = self.md.consumed_train_samples
             self.margs.consumed_valid_samples = self.md.consumed_valid_samples
-            print(f"Setting consumed_train_samples to {self.margs.consumed_train_samples}"
-                  f" and consumed_valid_samples to {self.margs.consumed_valid_samples}")
+            print(
+                f"Setting consumed_train_samples to {self.margs.consumed_train_samples}"
+                f" and consumed_valid_samples to {self.margs.consumed_valid_samples}"
+            )
         else:
             print("consumed_train_samples not provided.")
 
@@ -174,7 +202,7 @@ def initialize_megatron_env(self):
         mpu.set_tensor_model_parallel_rank(0)
         mpu.set_pipeline_model_parallel_rank(0)
         mpu.set_expert_model_parallel_rank(0)
-        
+
         # For backward compatibility during local parallel states refactoring
         fake_tp_group = _ConverterFakeProcessGroup(size=self.args.target_tensor_parallel_size)
         fake_pp_group = _ConverterFakeProcessGroup(size=self.args.target_pipeline_parallel_size)
@@ -188,9 +216,10 @@ def initialize_megatron_env(self):
         mpu._DATA_PARALLEL_GROUP_WITH_CP = fake_dp_group
         mpu._INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP = fake_dp_group
         mpu._EXPERT_DATA_PARALLEL_GROUP = fake_dp_ep_group
-        
+
         try:
             import torch_llm_debug_tools
+
             torch_llm_debug_tools.vscode_debugger_local_init()
         except ImportError:
             pass
@@ -225,42 +254,60 @@ def check_message(self, msg):
             print(f"Exiting. If you want to ignore this, use the argument --no-checking.")
             exit(1)
 
-    def build_sys_argv(self): 
+    def build_sys_argv(self):
         """
         Construct a sys.argv list for Megatron's argument parser.
         This centralizes the hack of overwriting sys.argv.
         """
         # We want all arguments to come from us
-        my_argv = ['script.py',
-                    '--num-layers', str(self.md.num_layers),
-                    '--hidden-size', str(self.md.hidden_size),
-                    '--seq-length', str(self.md.seq_length),
-                    '--num-experts', str(getattr(self.md, "num_experts", 0)),
-                    '--num-attention-heads', str(self.md.num_attention_heads),
-                    '--max-position-embeddings', str(self.md.max_position_embeddings),
-                    '--position-embedding-type', str(self.md.position_embedding_type),
-                    '--tokenizer-type', str(self.md.tokenizer_type),
-                    '--tensor-model-parallel-size', str(self.args.target_tensor_parallel_size),
-                    '--pipeline-model-parallel-size', str(self.args.target_pipeline_parallel_size),
-                    '--expert-model-parallel-size', str(self.args.target_expert_parallel_size),
-                    '--no-masked-softmax-fusion',
-                    '--no-bias-gelu-fusion',
-                    '--no-bias-dropout-fusion',
-                    '--use-cpu-initialization',
-                    '--micro-batch-size', '1',
-                    '--no-load-optim',
-                    '--no-load-rng',
-                    '--no-save-optim',
-                    '--no-save-rng',
-                    '--no-initialization',
-                    '--save-interval', '1',
-                    '--save', self.args.save_dir,
-                    '--ckpt-format', 'torch', # only 'torch' supported for conversion
-                    '--no-one-logger',
-                    ]
+        my_argv = [
+            'script.py',
+            '--num-layers',
+            str(self.md.num_layers),
+            '--hidden-size',
+            str(self.md.hidden_size),
+            '--seq-length',
+            str(self.md.seq_length),
+            '--num-experts',
+            str(getattr(self.md, "num_experts", 0)),
+            '--num-attention-heads',
+            str(self.md.num_attention_heads),
+            '--max-position-embeddings',
+            str(self.md.max_position_embeddings),
+            '--position-embedding-type',
+            str(self.md.position_embedding_type),
+            '--tokenizer-type',
+            str(self.md.tokenizer_type),
+            '--tensor-model-parallel-size',
+            str(self.args.target_tensor_parallel_size),
+            '--pipeline-model-parallel-size',
+            str(self.args.target_pipeline_parallel_size),
+            '--expert-model-parallel-size',
+            str(self.args.target_expert_parallel_size),
+            '--no-masked-softmax-fusion',
+            '--no-bias-gelu-fusion',
+            '--no-bias-dropout-fusion',
+            '--use-cpu-initialization',
+            '--micro-batch-size',
+            '1',
+            '--no-load-optim',
+            '--no-load-rng',
+            '--no-save-optim',
+            '--no-save-rng',
+            '--no-initialization',
+            '--save-interval',
+            '1',
+            '--save',
+            self.args.save_dir,
+            '--ckpt-format',
+            'torch',  # only 'torch' supported for conversion
+            '--no-one-logger',
+        ]
 
         if self.md.make_vocab_size_divisible_by is not None:
-            my_argv.extend(['--make-vocab-size-divisible-by', str(self.md.make_vocab_size_divisible_by)])
+            my_argv.extend(
+                ['--make-vocab-size-divisible-by', str(self.md.make_vocab_size_divisible_by)]
+            )
         if self.md.params_dtype == torch.float16:
             my_argv.append('--fp16')
         elif self.md.params_dtype == torch.bfloat16:
@@ -285,29 +332,46 @@ def receive_checkpoint_metadata(self):
             if hasattr(self.md, 'previous_tensor_parallel_size'):
                 self.args.target_tensor_parallel_size = self.md.previous_tensor_parallel_size
             else:
-                print("loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. "
-                      "Default to 1.")
+                print(
+                    "loader did not provide a tensor parallel size and --target-tensor-parallel-size not provided on command line. "
+                    "Default to 1."
+                )
                 self.args.target_tensor_parallel_size = 1
 
         if self.args.target_pipeline_parallel_size is None:
             if hasattr(self.md, 'previous_pipeline_parallel_size'):
                 self.args.target_pipeline_parallel_size = self.md.previous_pipeline_parallel_size
             else:
-                print("loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. "
-                      "Default to 1.")
+                print(
+                    "loader did not provide a pipeline parallel size and --target-pipeline-parallel-size not provided on command line. "
+                    "Default to 1."
+                )
                 self.args.target_pipeline_parallel_size = 1
 
         # Arguments do sanity checks on the world size, but we don't care,
         # so trick it into thinking we are plenty of processes
-        if self.args.target_tensor_parallel_size is not None and self.args.target_pipeline_parallel_size is not None:
+        if (
+            self.args.target_tensor_parallel_size is not None
+            and self.args.target_pipeline_parallel_size is not None
+        ):
             if self.args.target_expert_parallel_size is not None:
-                os.environ["WORLD_SIZE"] = f'{self.args.target_tensor_parallel_size * self.args.target_pipeline_parallel_size * self.args.target_expert_parallel_size}'
+                os.environ["WORLD_SIZE"] = (
+                    f'{self.args.target_tensor_parallel_size * self.args.target_pipeline_parallel_size * self.args.target_expert_parallel_size}'
+                )
             else:
-                os.environ["WORLD_SIZE"] = f'{self.args.target_tensor_parallel_size * self.args.target_pipeline_parallel_size}'
+                os.environ["WORLD_SIZE"] = (
+                    f'{self.args.target_tensor_parallel_size * self.args.target_pipeline_parallel_size}'
+                )
 
     def initialize_models(self):
         """Construct a 3D(PPxEPxTP) array for models, fill it with None"""
-        return [[[None for _ in range(self.args.target_tensor_parallel_size)] for _ in range(self.args.target_expert_parallel_size)] for _ in range(self.args.target_pipeline_parallel_size)]
+        return [
+            [
+                [None for _ in range(self.args.target_tensor_parallel_size)]
+                for _ in range(self.args.target_expert_parallel_size)
+            ]
+            for _ in range(self.args.target_pipeline_parallel_size)
+        ]
 
     def get_local_model(self, pp_rank, ep_rank, tp_rank):
         """
@@ -316,7 +380,9 @@ def get_local_model(self, pp_rank, ep_rank, tp_rank):
         if self.models[pp_rank][ep_rank][tp_rank] is None:
             pre_process = True if pp_rank == 0 else False
             post_process = True if pp_rank == self.args.target_pipeline_parallel_size - 1 else False
-            self.models[pp_rank][ep_rank][tp_rank] = self.model_provider(pre_process, post_process).to(self.md.params_dtype)
+            self.models[pp_rank][ep_rank][tp_rank] = self.model_provider(
+                pre_process, post_process
+            ).to(self.md.params_dtype)
         return self.models[pp_rank][ep_rank][tp_rank]
 
     def save(self):
@@ -344,8 +410,8 @@ def save_local_models_to_checkpoint(self):
         Save local models in self.models to a megatron checkpoint.
         """
         try:
-            from megatron.training.checkpointing import save_checkpoint
             from megatron.core import mpu
+            from megatron.training.checkpointing import save_checkpoint
         except ModuleNotFoundError as e:
             print(f"Unable to import required Megatron modules: {e}")
             sys.exit(1)
@@ -353,13 +419,21 @@ def save_local_models_to_checkpoint(self):
         for pp_rank in range(self.args.target_pipeline_parallel_size):
             mpu.set_pipeline_model_parallel_rank(pp_rank)
             # initial the first module in pp stage to get the layer_num, pooler, lm_head. binary_head
-            self.get_local_model(pp_rank,0,0)
+            self.get_local_model(pp_rank, 0, 0)
             for ep_rank in range(self.args.target_expert_parallel_size):
                 for tp_rank in range(self.args.target_tensor_parallel_size):
-                    save_checkpoint(self.md.iteration, [self.get_local_model(pp_rank, ep_rank, tp_rank)], None, None, num_floating_point_operations_so_far=0,
-                        pipeline_rank=pp_rank, pipeline_parallel=self.args.target_pipeline_parallel_size > 1,
-                        expert_rank=ep_rank, expert_parallel=self.args.target_expert_parallel_size > 1,
-                        tensor_rank=tp_rank)
+                    save_checkpoint(
+                        self.md.iteration,
+                        [self.get_local_model(pp_rank, ep_rank, tp_rank)],
+                        None,
+                        None,
+                        num_floating_point_operations_so_far=0,
+                        pipeline_rank=pp_rank,
+                        pipeline_parallel=self.args.target_pipeline_parallel_size > 1,
+                        expert_rank=ep_rank,
+                        expert_parallel=self.args.target_expert_parallel_size > 1,
+                        tensor_rank=tp_rank,
+                    )
                     # release the uselese model parts
                     self.models[pp_rank][ep_rank][tp_rank] = None
 
@@ -375,7 +449,7 @@ def receive_lm(self, schema, prefix=None):
             sys.exit(1)
 
         # Embeddings
-        #-----------
+        # -----------
         embeddings_msg = self.queue_get("embeddings")
         pos_embed = None
         if self.md.position_embedding_type == 'learned_absolute':
@@ -392,22 +466,24 @@ def pad_weight(orig_word_embed, true_vocab_size):
 
                 # Cut out extra padding we don't need
                 if orig_vocab_size > self.margs.padded_vocab_size:
-                    full_word_embed = orig_word_embed[0:self.margs.padded_vocab_size,:]
+                    full_word_embed = orig_word_embed[0 : self.margs.padded_vocab_size, :]
 
                 # Expanding embedding to larger size by replicating final entry
                 elif orig_vocab_size < self.margs.padded_vocab_size:
                     padding_size = self.margs.padded_vocab_size - orig_vocab_size
 
-                    full_word_embed = torch.cat((
-                        orig_word_embed,
-                        orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1)))
+                    full_word_embed = torch.cat(
+                        (orig_word_embed, orig_word_embed[-1].unsqueeze(0).expand(padding_size, -1))
+                    )
 
                 # Same size!
                 else:
                     full_word_embed = orig_word_embed
             else:
-                print("Original vocab size not specified, leaving embedding table as-is. "
-                    "If you've changed the tensor parallel size this could cause problems.")
+                print(
+                    "Original vocab size not specified, leaving embedding table as-is. "
+                    "If you've changed the tensor parallel size this could cause problems."
+                )
                 self.margs.padded_vocab_size = orig_word_embed.shape[0]
                 full_word_embed = orig_word_embed
             return full_word_embed
@@ -424,10 +500,7 @@ def pad_weight(orig_word_embed, true_vocab_size):
                 model = self.get_local_model(0, ep_rank, tp_rank)
                 if pos_embed is None:
                     assert not schema.has_position_embeddings(model)
-                schema.set("embeddings", model, {
-                    "pos" : pos_embed,
-                    "word" : out_word_embed[tp_rank],
-                })
+                schema.set("embeddings", model, {"pos": pos_embed, "word": out_word_embed[tp_rank]})
 
         # Transformer layers.
         # ------------------
@@ -435,7 +508,7 @@ def pad_weight(orig_word_embed, true_vocab_size):
         for pp_rank in range(self.args.target_pipeline_parallel_size):
             mpu.set_pipeline_model_parallel_rank(pp_rank)
             # initial the first module in pp stage to get the layer_num, pooler, lm_head. binary_head
-            self.get_local_model(pp_rank,0,0)
+            self.get_local_model(pp_rank, 0, 0)
             for layer_id in range(schema.get_num_layers(self.models[pp_rank][0][0])):
                 msg = self.queue_get(f"transformer layer {total_layer_num}")
 
@@ -447,103 +520,159 @@ def pad_weight(orig_word_embed, true_vocab_size):
                     post_norm_bias = msg.pop("post norm bias")
 
                 # Split up the parallel tensors
-                qkv_weight = chunk_weight(msg.pop("qkv weight"), "column", self.args.target_tensor_parallel_size)
-                dense_weight = chunk_weight(msg.pop("dense weight"), "row", self.args.target_tensor_parallel_size)
-                mlp_l1_weight = chunk_weight(msg.pop("mlp l1 weight"), "row", self.args.target_tensor_parallel_size, self.args.target_expert_parallel_size)
+                qkv_weight = chunk_weight(
+                    msg.pop("qkv weight"), "column", self.args.target_tensor_parallel_size
+                )
+                dense_weight = chunk_weight(
+                    msg.pop("dense weight"), "row", self.args.target_tensor_parallel_size
+                )
+                mlp_l1_weight = chunk_weight(
+                    msg.pop("mlp l1 weight"),
+                    "row",
+                    self.args.target_tensor_parallel_size,
+                    self.args.target_expert_parallel_size,
+                )
 
                 if self.margs.num_experts:
                     router = msg.pop("router weight")
 
                 # Special handling for swiglu
                 if self.md.swiglu:
-                    mlp_l0_weight_W = chunk_weight(msg.pop("mlp l0 weight W"), "column", self.args.target_tensor_parallel_size, self.args.target_expert_parallel_size)
-                    mlp_l0_weight_V = chunk_weight(msg.pop("mlp l0 weight V"), "column", self.args.target_tensor_parallel_size, self.args.target_expert_parallel_size)
+                    mlp_l0_weight_W = chunk_weight(
+                        msg.pop("mlp l0 weight W"),
+                        "column",
+                        self.args.target_tensor_parallel_size,
+                        self.args.target_expert_parallel_size,
+                    )
+                    mlp_l0_weight_V = chunk_weight(
+                        msg.pop("mlp l0 weight V"),
+                        "column",
+                        self.args.target_tensor_parallel_size,
+                        self.args.target_expert_parallel_size,
+                    )
                     mlp_l0_weight = torch.cat((mlp_l0_weight_W, mlp_l0_weight_V), dim=-2)
                 else:
-                    mlp_l0_weight = chunk_weight(msg.pop("mlp l0 weight"), "column", self.args.target_tensor_parallel_size, self.args.target_expert_parallel_size)
+                    mlp_l0_weight = chunk_weight(
+                        msg.pop("mlp l0 weight"),
+                        "column",
+                        self.args.target_tensor_parallel_size,
+                        self.args.target_expert_parallel_size,
+                    )
 
                 if self.md.qkv_bias:
-                    qkv_bias = chunk_bias(msg.pop("qkv bias"), 'column', self.args.target_tensor_parallel_size)
+                    qkv_bias = chunk_bias(
+                        msg.pop("qkv bias"), 'column', self.args.target_tensor_parallel_size
+                    )
                 if self.md.linear_bias:
                     dense_bias = msg.pop("dense bias")
-                    mlp_l1_bias = chunk_bias(msg.pop("mlp l1 bias"), 'row', self.args.target_tensor_parallel_size, self.args.target_expert_parallel_size)
+                    mlp_l1_bias = chunk_bias(
+                        msg.pop("mlp l1 bias"),
+                        'row',
+                        self.args.target_tensor_parallel_size,
+                        self.args.target_expert_parallel_size,
+                    )
                     if self.md.swiglu:
-                        mlp_l0_bias_W = chunk_bias(msg.pop("mlp l0 bias W"), 'column', self.args.target_tensor_parallel_size, self.args.target_expert_parallel_size)
-                        mlp_l0_bias_V = chunk_bias(msg.pop("mlp l0 bias V"), 'column', self.args.target_tensor_parallel_size, self.args.target_expert_parallel_size)
+                        mlp_l0_bias_W = chunk_bias(
+                            msg.pop("mlp l0 bias W"),
+                            'column',
+                            self.args.target_tensor_parallel_size,
+                            self.args.target_expert_parallel_size,
+                        )
+                        mlp_l0_bias_V = chunk_bias(
+                            msg.pop("mlp l0 bias V"),
+                            'column',
+                            self.args.target_tensor_parallel_size,
+                            self.args.target_expert_parallel_size,
+                        )
                         mlp_l0_bias = torch.cat((mlp_l0_bias_W, mlp_l0_bias_V), dim=-1)
                     else:
-                        mlp_l0_bias = chunk_bias(msg.pop("mlp l0 bias"), 'column', self.args.target_tensor_parallel_size, self.args.target_expert_parallel_size)
+                        mlp_l0_bias = chunk_bias(
+                            msg.pop("mlp l0 bias"),
+                            'column',
+                            self.args.target_tensor_parallel_size,
+                            self.args.target_expert_parallel_size,
+                        )
 
                 # Save them to the model
                 for ep_rank in range(self.args.target_expert_parallel_size):
                     for tp_rank in range(self.args.target_tensor_parallel_size):
                         params_dict = {
-                            "self_attn_norm_weight" : input_norm_weight,
-                            "self_attn_qkv_weight" : qkv_weight[tp_rank],
-                            "self_attn_proj_weight" : dense_weight[tp_rank],
-                            "mlp_norm_weight" : post_norm_weight
+                            "self_attn_norm_weight": input_norm_weight,
+                            "self_attn_qkv_weight": qkv_weight[tp_rank],
+                            "self_attn_proj_weight": dense_weight[tp_rank],
+                            "mlp_norm_weight": post_norm_weight,
                         }
                         if self.margs.num_experts:
-                            params_dict.update({
-                                "mlp_fc1_weight" : mlp_l0_weight[ep_rank][tp_rank],
-                                "mlp_fc2_weight" : mlp_l1_weight[ep_rank][tp_rank]
-                            })
+                            params_dict.update(
+                                {
+                                    "mlp_fc1_weight": mlp_l0_weight[ep_rank][tp_rank],
+                                    "mlp_fc2_weight": mlp_l1_weight[ep_rank][tp_rank],
+                                }
+                            )
                         else:
-                            params_dict.update({
-                                "mlp_fc1_weight" : mlp_l0_weight[tp_rank],
-                                "mlp_fc2_weight" : mlp_l1_weight[tp_rank]
-                            })
-                        params_dict.update({
-                            "self_attn_norm_bias" : input_norm_bias if self.md.norm_has_bias else None,
-                            "mlp_norm_bias" : post_norm_bias if self.md.norm_has_bias else None,
-                        })
+                            params_dict.update(
+                                {
+                                    "mlp_fc1_weight": mlp_l0_weight[tp_rank],
+                                    "mlp_fc2_weight": mlp_l1_weight[tp_rank],
+                                }
+                            )
+                        params_dict.update(
+                            {
+                                "self_attn_norm_bias": (
+                                    input_norm_bias if self.md.norm_has_bias else None
+                                ),
+                                "mlp_norm_bias": post_norm_bias if self.md.norm_has_bias else None,
+                            }
+                        )
                         if self.md.qkv_bias:
-                            params_dict.update({
-                                "self_attn_qkv_bias" : qkv_bias[tp_rank]
-                            })
+                            params_dict.update({"self_attn_qkv_bias": qkv_bias[tp_rank]})
                         if self.md.linear_bias:
-                            params_dict.update({
-                                "self_attn_proj_bias" : dense_bias
-                            })
+                            params_dict.update({"self_attn_proj_bias": dense_bias})
                             if self.margs.num_experts:
-                                params_dict.update({
-                                    "mlp_fc1_bias" : mlp_l0_bias[ep_rank][tp_rank],
-                                    "mlp_fc2_bias" : mlp_l1_bias[ep_rank]
-                                })
-                            else :
-                                params_dict.update({
-                                    "mlp_fc1_bias" : mlp_l0_bias[tp_rank],
-                                    "mlp_fc2_bias" : mlp_l1_bias
-                                })
+                                params_dict.update(
+                                    {
+                                        "mlp_fc1_bias": mlp_l0_bias[ep_rank][tp_rank],
+                                        "mlp_fc2_bias": mlp_l1_bias[ep_rank],
+                                    }
+                                )
+                            else:
+                                params_dict.update(
+                                    {
+                                        "mlp_fc1_bias": mlp_l0_bias[tp_rank],
+                                        "mlp_fc2_bias": mlp_l1_bias,
+                                    }
+                                )
                         if self.margs.num_experts:
-                            params_dict.update({
-                                "router_weight":  router
-                            })
+                            params_dict.update({"router_weight": router})
                         model = self.get_local_model(pp_rank, ep_rank, tp_rank)
                         schema.set_layer(model, layer_id, params_dict)
 
                 total_layer_num = total_layer_num + 1
                 self.check_message(msg)
 
-
             if pp_rank == self.args.target_pipeline_parallel_size - 1:
                 msg = self.queue_get("final norm")
                 final_norm_weight = msg.pop("weight")
                 if self.md.norm_has_bias:
                     final_norm_bias = msg.pop("bias")
-                pp_local_models = [self.get_local_model(pp_rank, ep_rank, tp_rank) for ep_rank in range(self.args.target_expert_parallel_size)
-                    for tp_rank in range(self.args.target_tensor_parallel_size)]
+                pp_local_models = [
+                    self.get_local_model(pp_rank, ep_rank, tp_rank)
+                    for ep_rank in range(self.args.target_expert_parallel_size)
+                    for tp_rank in range(self.args.target_tensor_parallel_size)
+                ]
                 for eptp_rank, model in enumerate(pp_local_models):
                     tp_rank = eptp_rank % self.args.target_tensor_parallel_size
-                    schema.set("final_norm", model, {
-                        "weight" : final_norm_weight,
-                        "bias" : final_norm_bias if self.md.norm_has_bias else None,
-                    })
+                    schema.set(
+                        "final_norm",
+                        model,
+                        {
+                            "weight": final_norm_weight,
+                            "bias": final_norm_bias if self.md.norm_has_bias else None,
+                        },
+                    )
                     if pp_rank != 0 and not self.md.output_layer:
                         # Copy word embeddings to final pipeline rank
-                        schema.set("output_layer", model, {
-                            "weight" : out_word_embed[tp_rank],
-                        })
+                        schema.set("output_layer", model, {"weight": out_word_embed[tp_rank]})
                 del final_norm_weight
                 if self.md.norm_has_bias:
                     del final_norm_bias
@@ -551,38 +680,56 @@ def pad_weight(orig_word_embed, true_vocab_size):
 
                 if self.md.output_layer:
                     msg = self.queue_get("output layer")
-                    if not hasattr(pp_local_models[0] if prefix is None else getattr(pp_local_models[0], prefix), 'output_layer'):
+                    if not hasattr(
+                        (
+                            pp_local_models[0]
+                            if prefix is None
+                            else getattr(pp_local_models[0], prefix)
+                        ),
+                        'output_layer',
+                    ):
                         print("ERROR: got an output layer, but model does not have one")
                         exit(1)
                     output_layer_weight = pad_weight(msg.pop("weight"), self.md.true_vocab_size)
-                    output_layer_weight = torch.chunk(output_layer_weight, self.args.target_tensor_parallel_size, dim=0)
+                    output_layer_weight = torch.chunk(
+                        output_layer_weight, self.args.target_tensor_parallel_size, dim=0
+                    )
                     for eptp_rank, model in enumerate(pp_local_models):
                         tp_rank = eptp_rank % self.args.target_tensor_parallel_size
-                        schema.set("output_layer", model, {
-                            "weight" : output_layer_weight[tp_rank],
-                        })
+                        schema.set("output_layer", model, {"weight": output_layer_weight[tp_rank]})
                     self.check_message(msg)
 
                 msg = self.queue_get()
                 if msg != "done" and msg["name"] == "pooler":
-                    if not hasattr(self.models[pp_rank][0][0] if prefix is None else getattr(self.models[pp_rank][0][0], prefix), 'pooler'):
+                    if not hasattr(
+                        (
+                            self.models[pp_rank][0][0]
+                            if prefix is None
+                            else getattr(self.models[pp_rank][0][0], prefix)
+                        ),
+                        'pooler',
+                    ):
                         print("ERROR: got a pooler, but model does not have one")
                         exit(1)
                     print("received pooler")
                     pooler_weight = msg.pop("weight")
                     pooler_bias = msg.pop("bias")
                     for model in pp_local_models:
-                        schema.set("pooler", model, {
-                            "weight" : pooler_weight,
-                            "bias" : pooler_bias,
-                        })
+                        schema.set("pooler", model, {"weight": pooler_weight, "bias": pooler_bias})
                     del pooler_weight
                     del pooler_bias
                     self.check_message(msg)
                     msg = self.queue_get()
 
                 if msg != "done" and msg["name"] == "lm head":
-                    if not hasattr(self.models[pp_rank][0][0] if prefix is None else getattr(self.models[pp_rank][0][0], prefix), 'lm_head'):
+                    if not hasattr(
+                        (
+                            self.models[pp_rank][0][0]
+                            if prefix is None
+                            else getattr(self.models[pp_rank][0][0], prefix)
+                        ),
+                        'lm_head',
+                    ):
                         print("ERROR: got an lm head, but model does not have one")
                         exit(1)
                     print("received lm head")
@@ -592,27 +739,39 @@ def pad_weight(orig_word_embed, true_vocab_size):
                     if self.md.norm_has_bias:
                         lm_head_norm_bias = msg.pop("norm bias")
                     for model in pp_local_models:
-                        schema.set("lm_head", model, {
-                            "dense_weight" : lm_head_dense_weight,
-                            "dense_bias" : lm_head_dense_bias,
-                            "norm_weight" : lm_head_norm_weight,
-                            "norm_bias" : lm_head_norm_bias if self.md.norm_has_bias else None,
-                        })
+                        schema.set(
+                            "lm_head",
+                            model,
+                            {
+                                "dense_weight": lm_head_dense_weight,
+                                "dense_bias": lm_head_dense_bias,
+                                "norm_weight": lm_head_norm_weight,
+                                "norm_bias": lm_head_norm_bias if self.md.norm_has_bias else None,
+                            },
+                        )
                     self.check_message(msg)
                     msg = self.queue_get()
 
                 if msg != "done" and msg["name"] == "binary head":
-                    if not hasattr(self.models[pp_rank][0][0] if prefix is None else getattr(self.models[pp_rank][0][0], prefix), 'binary_head'):
+                    if not hasattr(
+                        (
+                            self.models[pp_rank][0][0]
+                            if prefix is None
+                            else getattr(self.models[pp_rank][0][0], prefix)
+                        ),
+                        'binary_head',
+                    ):
                         print("ERROR: got a binary head, but model does not have one")
                         exit(1)
                     print("received binary head")
                     binary_head_weight = msg.pop("weight")
                     binary_head_bias = msg.pop("bias")
                     for model in pp_local_models:
-                        schema.set("binary_head", model, {
-                            "weight" : binary_head_weight,
-                            "bias" : binary_head_bias,
-                        })
+                        schema.set(
+                            "binary_head",
+                            model,
+                            {"weight": binary_head_weight, "bias": binary_head_bias},
+                        )
                     self.check_message(msg)
                     msg = self.queue_get()
 
diff --git a/tools/run_dynamic_text_generation_server.py b/tools/run_dynamic_text_generation_server.py
index ed5ab473d63..56edd6a116e 100644
--- a/tools/run_dynamic_text_generation_server.py
+++ b/tools/run_dynamic_text_generation_server.py
@@ -24,8 +24,10 @@ def add_text_generation_server_args(parser: argparse.ArgumentParser):
     parser = add_inference_args(parser)
     parser.add_argument("--port", type=int, default=5000, help="Port for Flask server to run on")
     parser.add_argument(
-        "--host", type=str, default=None,
-        help="Hostname or IP address to bind the server to. Defaults to 0.0.0.0 (all interfaces)."
+        "--host",
+        type=str,
+        default=None,
+        help="Hostname or IP address to bind the server to. Defaults to 0.0.0.0 (all interfaces).",
     )
     parser.add_argument(
         "--parsers", type=str, nargs="+", default=[], help="Parsers to use for parsing the response"
@@ -35,7 +37,10 @@ def add_text_generation_server_args(parser: argparse.ArgumentParser):
 
 @trace_async_exceptions
 async def run_text_generation_server(
-    engine: DynamicInferenceEngine, coordinator_port: int, server_port: int, hostname: str | None = None,
+    engine: DynamicInferenceEngine,
+    coordinator_port: int,
+    server_port: int,
+    hostname: str | None = None,
 ):
     """
     Runs the text generation server from rank 0 and initializes the
@@ -50,7 +55,8 @@ async def run_text_generation_server(
     rank = torch.distributed.get_rank()
 
     coordinator_addr = await engine.start_listening_to_data_parallel_coordinator(
-        inference_coordinator_port=coordinator_port, launch_inference_coordinator=True,
+        inference_coordinator_port=coordinator_port,
+        launch_inference_coordinator=True,
         hostname=hostname,
     )
 
@@ -93,7 +99,9 @@ async def run_text_generation_server(
 
         try:
             asyncio.run(
-                run_text_generation_server(engine, args.inference_coordinator_port, args.port, args.host)
+                run_text_generation_server(
+                    engine, args.inference_coordinator_port, args.port, args.host
+                )
             )
         except KeyboardInterrupt:
             # Catching at the top level ensures clean stdout without spamming the traceback
diff --git a/tools/run_inference_performance_test.py b/tools/run_inference_performance_test.py
index ac9e92d3639..4140740e284 100644
--- a/tools/run_inference_performance_test.py
+++ b/tools/run_inference_performance_test.py
@@ -26,7 +26,11 @@
 )
 from megatron.core.tokenizers.utils.build_tokenizer import build_tokenizer
 from megatron.core.transformer.module import MegatronModule
-from megatron.inference.utils import add_inference_args, get_dynamic_inference_engine, get_model_for_inference
+from megatron.inference.utils import (
+    add_inference_args,
+    get_dynamic_inference_engine,
+    get_model_for_inference,
+)
 from model_provider import model_provider
 
 sys.path.append(
@@ -38,8 +42,8 @@
 
 from megatron.core import mpu
 from megatron.training import get_args, get_model, get_tokenizer
-from megatron.training.checkpointing import load_checkpoint
 from megatron.training.arguments import parse_and_validate_args
+from megatron.training.checkpointing import load_checkpoint
 from megatron.training.initialize import initialize_megatron
 
 REQUEST_ID = 0
diff --git a/tools/run_text_generation_server.py b/tools/run_text_generation_server.py
index 5a2940f1a4c..b8ddd986fa6 100644
--- a/tools/run_text_generation_server.py
+++ b/tools/run_text_generation_server.py
@@ -39,8 +39,8 @@
 
 from megatron.core import mpu
 from megatron.training import get_args, get_model, get_tokenizer
-from megatron.training.checkpointing import load_checkpoint
 from megatron.training.arguments import parse_and_validate_args
+from megatron.training.checkpointing import load_checkpoint
 from megatron.training.initialize import initialize_megatron
 
 
@@ -61,16 +61,14 @@ def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngi
 
     tokenizer = get_tokenizer()
 
-    inference_context = StaticInferenceContext(args.inference_max_requests, args.inference_max_sequence_length)
-    inference_wrapped_model = GPTInferenceWrapper(
-        model, inference_context
+    inference_context = StaticInferenceContext(
+        args.inference_max_requests, args.inference_max_sequence_length
     )
+    inference_wrapped_model = GPTInferenceWrapper(model, inference_context)
     text_generation_controller = TextGenerationController(
         inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
     )
-    return StaticInferenceEngine(
-        text_generation_controller=text_generation_controller,
-    )
+    return StaticInferenceEngine(text_generation_controller=text_generation_controller)
 
 
 def add_text_generate_args(parser):
@@ -180,5 +178,6 @@ def main(model_type: str = "gpt"):
         elif choice.item() == 1:
             break
 
+
 if __name__ == "__main__":
     main(model_type="gpt")
diff --git a/tools/run_vlm_text_generation.py b/tools/run_vlm_text_generation.py
index 17c3b1030dc..6662d6befa3 100644
--- a/tools/run_vlm_text_generation.py
+++ b/tools/run_vlm_text_generation.py
@@ -188,9 +188,9 @@ def __call__(self, tokens, position_ids, attention_mask):
         # Update the sequence length offset by the number of image tokens.
         num_tokens = tokens.size(1)
         if num_tokens > 1:
-            self.inference_context.sequence_len_offset += self.inference_context.key_value_memory_dict[
-                "image_tokens_count"
-            ]
+            self.inference_context.sequence_len_offset += (
+                self.inference_context.key_value_memory_dict["image_tokens_count"]
+            )
 
         return logits
 
diff --git a/train_rl.py b/train_rl.py
index 8bcee5f096d..06457be1245 100644
--- a/train_rl.py
+++ b/train_rl.py
@@ -12,6 +12,7 @@
 from megatron.core import mpu
 from megatron.core.enums import ModelType
 from megatron.core.models.gpt import GPTModel
+from megatron.core.packed_seq_params import PackedSeqParams
 from megatron.core.parallel_state import is_pipeline_last_stage
 from megatron.core.rerun_state_machine import get_rerun_state_machine
 from megatron.core.utils import StragglerDetector
@@ -21,20 +22,19 @@
     get_rl_runtime_state,
     load_packed_data_by_index,
 )
+from megatron.rl.sequence_packing_utils import get_default_packed_seq_params
 from megatron.training import get_args, get_timers, pretrain, print_rank_0
-from megatron.training.utils import is_hybrid_model
 from megatron.training.arguments import core_transformer_config_from_args, parse_and_validate_args
+from megatron.training.utils import is_hybrid_model
 from model_provider import model_provider
 
-from megatron.core.packed_seq_params import PackedSeqParams
-from megatron.rl.sequence_packing_utils import get_default_packed_seq_params
-
 stimer = StragglerDetector()
 
 import logging
 
 logging.basicConfig(level=logging.INFO, force=True)
 
+
 def _gpt_builder(args, pre_process, post_process, vp_stage=None, config=None, pg_collection=None):
     # TODO(Peter): This is a hack to get around the fact that we are activation recomputation for training but not
     # for inference with cuda graphs. Without out this the post checks in the transformer config will assert error.
@@ -223,7 +223,11 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False):
             seq_lengths,
             seq_indices,
             packed_seq_params,
-        ) = load_packed_data_by_index(bin_tensor.item(), runtime_state.packing_context, args.rl_inference_logprobs_is_correction)
+        ) = load_packed_data_by_index(
+            bin_tensor.item(),
+            runtime_state.packing_context,
+            args.rl_inference_logprobs_is_correction,
+        )
 
         runtime_state.increment_sequences(len(seq_indices))
     else:
@@ -289,8 +293,7 @@ def forward_step(data_iterator, model: GPTModel, loss_only: bool = False):
     # Get current logprobs and calculate loss with straggler detection
     with stimer:
         logprobs_or_hidden_states = get_logprobs(
-            model_to_use, tokens, position_ids, no_grad=False,
-            packed_seq_params=packed_seq_params
+            model_to_use, tokens, position_ids, no_grad=False, packed_seq_params=packed_seq_params
         )
 
         if not is_pipeline_last_stage():
@@ -410,10 +413,7 @@ def _model_builder(
                 pg_collection=pg_collection,
             )
 
-    parse_and_validate_args(
-        extra_args_provider=add_inference_args,
-        args_defaults={},
-    )
+    parse_and_validate_args(extra_args_provider=add_inference_args, args_defaults={})
     pretrain(
         None,  # we don't need to build any datasets for RL training
         partial(model_provider, _model_builder),
diff --git a/uv.lock b/uv.lock
index cad7702bc60..69da4be0606 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,6 +1,6 @@
 version = 1
 revision = 2
-requires-python = ">=3.12"
+requires-python = ">=3.10"
 resolution-markers = [
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
@@ -10,16 +10,24 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
@@ -27,26 +35,38 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
 ]
 conflicts = [[
     { package = "megatron-core", extra = "dev" },
@@ -75,10 +95,9 @@ version = "1.13.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "huggingface-hub" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging" },
     { name = "psutil" },
     { name = "pyyaml" },
     { name = "safetensors" },
@@ -103,7 +122,7 @@ wheels = [
 
 [[package]]
 name = "aiobotocore"
-version = "3.4.0"
+version = "3.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohttp" },
@@ -112,11 +131,12 @@ dependencies = [
     { name = "jmespath" },
     { name = "multidict" },
     { name = "python-dateutil" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
     { name = "wrapt" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b8/50/a48ed11b15f926ce3dbb33e7fb0f25af17dbb99bcb7ae3b30c763723eca7/aiobotocore-3.4.0.tar.gz", hash = "sha256:a918b5cb903f81feba7e26835aed4b5e6bb2d0149d7f42bb2dd7d8089e3d9000", size = 122360, upload-time = "2026-04-07T06:12:24.884Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/71/9f/a0568deaf008f4a7e3d57a7f80f1537df894df0e49bd4a790bb22f9a2d8e/aiobotocore-3.3.0.tar.gz", hash = "sha256:9abc21d91edd6c9c2e4a07e11bdfcbb159f0b9116ab2a0a5a349113533a18fb2", size = 122940, upload-time = "2026-03-18T09:58:49.077Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/df/d8/ce9386e6d76ea79e61dee15e62aa48cff6be69e89246b0ac4a11857cb02c/aiobotocore-3.4.0-py3-none-any.whl", hash = "sha256:26290eb6830ea92d8a6f5f90b56e9f5cedd6d126074d5db63b195e281d982465", size = 88018, upload-time = "2026-04-07T06:12:22.684Z" },
+    { url = "https://files.pythonhosted.org/packages/16/54/a295bd8d7ac900c339b2c7024ed0ff9538afb60e92eb0979b8bb49deb20e/aiobotocore-3.3.0-py3-none-any.whl", hash = "sha256:9125ab2b63740dfe3b66b8d5a90d13aed9587b850aa53225ef214a04a1aa7fdc", size = 87817, upload-time = "2026-03-18T09:58:47.466Z" },
 ]
 
 [[package]]
@@ -144,6 +164,8 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "aiohappyeyeballs" },
     { name = "aiosignal" },
+    { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "async-timeout", version = "5.0.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "attrs" },
     { name = "frozenlist" },
     { name = "multidict" },
@@ -152,6 +174,40 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/77/9a/152096d4808df8e4268befa55fba462f440f14beab85e8ad9bf990516918/aiohttp-3.13.5.tar.gz", hash = "sha256:9d98cc980ecc96be6eb4c1994ce35d28d8b1f5e5208a23b421187d1209dbb7d1", size = 7858271, upload-time = "2026-03-31T22:01:03.343Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/bd/85/cebc47ee74d8b408749073a1a46c6fcba13d170dc8af7e61996c6c9394ac/aiohttp-3.13.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:02222e7e233295f40e011c1b00e3b0bd451f22cf853a0304c3595633ee47da4b", size = 750547, upload-time = "2026-03-31T21:56:30.024Z" },
+    { url = "https://files.pythonhosted.org/packages/05/98/afd308e35b9d3d8c9ec54c0918f1d722c86dc17ddfec272fcdbcce5a3124/aiohttp-3.13.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:bace460460ed20614fa6bc8cb09966c0b8517b8c58ad8046828c6078d25333b5", size = 503535, upload-time = "2026-03-31T21:56:31.935Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/4d/926c183e06b09d5270a309eb50fbde7b09782bfd305dec1e800f329834fb/aiohttp-3.13.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f546a4dc1e6a5edbb9fd1fd6ad18134550e096a5a43f4ad74acfbd834fc6670", size = 497830, upload-time = "2026-03-31T21:56:33.654Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/d6/f47d1c690f115a5c2a5e8938cce4a232a5be9aac5c5fb2647efcbbbda333/aiohttp-3.13.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c86969d012e51b8e415a8c6ce96f7857d6a87d6207303ab02d5d11ef0cad2274", size = 1682474, upload-time = "2026-03-31T21:56:35.513Z" },
+    { url = "https://files.pythonhosted.org/packages/01/44/056fd37b1bb52eac760303e5196acc74d9d546631b035704ae5927f7b4ac/aiohttp-3.13.5-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b6f6cd1560c5fa427e3b6074bb24d2c64e225afbb7165008903bd42e4e33e28a", size = 1655259, upload-time = "2026-03-31T21:56:37.843Z" },
+    { url = "https://files.pythonhosted.org/packages/91/9f/78eb1a20c1c28ae02f6a3c0f4d7b0dcc66abce5290cadd53d78ce3084175/aiohttp-3.13.5-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:636bc362f0c5bbc7372bc3ae49737f9e3030dbce469f0f422c8f38079780363d", size = 1736204, upload-time = "2026-03-31T21:56:39.822Z" },
+    { url = "https://files.pythonhosted.org/packages/de/6c/d20d7de23f0b52b8c1d9e2033b2db1ac4dacbb470bb74c56de0f5f86bb4f/aiohttp-3.13.5-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6a7cbeb06d1070f1d14895eeeed4dac5913b22d7b456f2eb969f11f4b3993796", size = 1826198, upload-time = "2026-03-31T21:56:41.378Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/86/a6f3ff1fd795f49545a7c74b2c92f62729135d73e7e4055bf74da5a26c82/aiohttp-3.13.5-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bca9ef7517fd7874a1a08970ae88f497bf5c984610caa0bf40bd7e8450852b95", size = 1681329, upload-time = "2026-03-31T21:56:43.374Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/68/84cd3dab6b7b4f3e6fe9459a961acb142aaab846417f6e8905110d7027e5/aiohttp-3.13.5-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:019a67772e034a0e6b9b17c13d0a8fe56ad9fb150fc724b7f3ffd3724288d9e5", size = 1560023, upload-time = "2026-03-31T21:56:45.031Z" },
+    { url = "https://files.pythonhosted.org/packages/41/2c/db61b64b0249e30f954a65ab4cb4970ced57544b1de2e3c98ee5dc24165f/aiohttp-3.13.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f34ecee82858e41dd217734f0c41a532bd066bcaab636ad830f03a30b2a96f2a", size = 1652372, upload-time = "2026-03-31T21:56:47.075Z" },
+    { url = "https://files.pythonhosted.org/packages/25/6f/e96988a6c982d047810c772e28c43c64c300c943b0ed5c1c0c4ce1e1027c/aiohttp-3.13.5-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:4eac02d9af4813ee289cd63a361576da36dba57f5a1ab36377bc2600db0cbb73", size = 1662031, upload-time = "2026-03-31T21:56:48.835Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/26/a56feace81f3d347b4052403a9d03754a0ab23f7940780dada0849a38c92/aiohttp-3.13.5-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:4beac52e9fe46d6abf98b0176a88154b742e878fdf209d2248e99fcdf73cd297", size = 1708118, upload-time = "2026-03-31T21:56:50.833Z" },
+    { url = "https://files.pythonhosted.org/packages/78/6e/b6173a8ff03d01d5e1a694bc06764b5dad1df2d4ed8f0ceec12bb3277936/aiohttp-3.13.5-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:c180f480207a9b2475f2b8d8bd7204e47aec952d084b2a2be58a782ffcf96074", size = 1548667, upload-time = "2026-03-31T21:56:52.81Z" },
+    { url = "https://files.pythonhosted.org/packages/16/13/13296ffe2c132d888b3fe2c195c8b9c0c24c89c3fa5cc2c44464dc23b22e/aiohttp-3.13.5-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:2837fb92951564d6339cedae4a7231692aa9f73cbc4fb2e04263b96844e03b4e", size = 1724490, upload-time = "2026-03-31T21:56:54.541Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/b4/1f1c287f4a79782ef36e5a6e62954c85343bc30470d862d30bd5f26c9fa2/aiohttp-3.13.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:d9010032a0b9710f58012a1e9c222528763d860ba2ee1422c03473eab47703e7", size = 1667109, upload-time = "2026-03-31T21:56:56.21Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/42/8461a2aaf60a8f4ea4549a4056be36b904b0eb03d97ca9a8a2604681a500/aiohttp-3.13.5-cp310-cp310-win32.whl", hash = "sha256:7c4b6668b2b2b9027f209ddf647f2a4407784b5d88b8be4efcc72036f365baf9", size = 439478, upload-time = "2026-03-31T21:56:58.292Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/71/06956304cb5ee439dfe8d86e1b2e70088bd88ed1ced1f42fb29e5d855f0e/aiohttp-3.13.5-cp310-cp310-win_amd64.whl", hash = "sha256:cd3db5927bf9167d5a6157ddb2f036f6b6b0ad001ac82355d43e97a4bde76d76", size = 462047, upload-time = "2026-03-31T21:57:00.257Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/f5/a20c4ac64aeaef1679e25c9983573618ff765d7aa829fa2b84ae7573169e/aiohttp-3.13.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ab7229b6f9b5c1ba4910d6c41a9eb11f543eadb3f384df1b4c293f4e73d44d6", size = 757513, upload-time = "2026-03-31T21:57:02.146Z" },
+    { url = "https://files.pythonhosted.org/packages/75/0a/39fa6c6b179b53fcb3e4b3d2b6d6cad0180854eda17060c7218540102bef/aiohttp-3.13.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8f14c50708bb156b3a3ca7230b3d820199d56a48e3af76fa21c2d6087190fe3d", size = 506748, upload-time = "2026-03-31T21:57:04.275Z" },
+    { url = "https://files.pythonhosted.org/packages/87/ec/e38ce072e724fd7add6243613f8d1810da084f54175353d25ccf9f9c7e5a/aiohttp-3.13.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e7d2f8616f0ff60bd332022279011776c3ac0faa0f1b463f7bb12326fbc97a1c", size = 501673, upload-time = "2026-03-31T21:57:06.208Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/ba/3bc7525d7e2beaa11b309a70d48b0d3cfc3c2089ec6a7d0820d59c657053/aiohttp-3.13.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a2567b72e1ffc3ab25510db43f355b29eeada56c0a622e58dcdb19530eb0a3cb", size = 1763757, upload-time = "2026-03-31T21:57:07.882Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/ab/e87744cf18f1bd78263aba24924d4953b41086bd3a31d22452378e9028a0/aiohttp-3.13.5-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:fb0540c854ac9c0c5ad495908fdfd3e332d553ec731698c0e29b1877ba0d2ec6", size = 1720152, upload-time = "2026-03-31T21:57:09.946Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/f3/ed17a6f2d742af17b50bae2d152315ed1b164b07a5fd5cc1754d99e4dfa5/aiohttp-3.13.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c9883051c6972f58bfc4ebb2116345ee2aa151178e99c3f2b2bbe2af712abd13", size = 1818010, upload-time = "2026-03-31T21:57:12.157Z" },
+    { url = "https://files.pythonhosted.org/packages/53/06/ecbc63dc937192e2a5cb46df4d3edb21deb8225535818802f210a6ea5816/aiohttp-3.13.5-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2294172ce08a82fb7c7273485895de1fa1186cc8294cfeb6aef4af42ad261174", size = 1907251, upload-time = "2026-03-31T21:57:14.023Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/a5/0521aa32c1ddf3aa1e71dcc466be0b7db2771907a13f18cddaa45967d97b/aiohttp-3.13.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3a807cabd5115fb55af198b98178997a5e0e57dead43eb74a93d9c07d6d4a7dc", size = 1759969, upload-time = "2026-03-31T21:57:16.146Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/78/a38f8c9105199dd3b9706745865a8a59d0041b6be0ca0cc4b2ccf1bab374/aiohttp-3.13.5-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:aa6d0d932e0f39c02b80744273cd5c388a2d9bc07760a03164f229c8e02662f6", size = 1616871, upload-time = "2026-03-31T21:57:17.856Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/41/27392a61ead8ab38072105c71aa44ff891e71653fe53d576a7067da2b4e8/aiohttp-3.13.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:60869c7ac4aaabe7110f26499f3e6e5696eae98144735b12a9c3d9eae2b51a49", size = 1739844, upload-time = "2026-03-31T21:57:19.679Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/55/5564e7ae26d94f3214250009a0b1c65a0c6af4bf88924ccb6fdab901de28/aiohttp-3.13.5-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:26d2f8546f1dfa75efa50c3488215a903c0168d253b75fba4210f57ab77a0fb8", size = 1731969, upload-time = "2026-03-31T21:57:22.006Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/c5/705a3929149865fc941bcbdd1047b238e4a72bcb215a9b16b9d7a2e8d992/aiohttp-3.13.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1162a1492032c82f14271e831c8f4b49f2b6078f4f5fc74de2c912fa225d51d", size = 1795193, upload-time = "2026-03-31T21:57:24.256Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/19/edabed62f718d02cff7231ca0db4ef1c72504235bc467f7b67adb1679f48/aiohttp-3.13.5-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:8b14eb3262fad0dc2f89c1a43b13727e709504972186ff6a99a3ecaa77102b6c", size = 1606477, upload-time = "2026-03-31T21:57:26.364Z" },
+    { url = "https://files.pythonhosted.org/packages/de/fc/76f80ef008675637d88d0b21584596dc27410a990b0918cb1e5776545b5b/aiohttp-3.13.5-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:ca9ac61ac6db4eb6c2a0cd1d0f7e1357647b638ccc92f7e9d8d133e71ed3c6ac", size = 1813198, upload-time = "2026-03-31T21:57:28.316Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/67/5b3ac26b80adb20ea541c487f73730dc8fa107d632c998f25bbbab98fcda/aiohttp-3.13.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:7996023b2ed59489ae4762256c8516df9820f751cf2c5da8ed2fb20ee50abab3", size = 1752321, upload-time = "2026-03-31T21:57:30.549Z" },
+    { url = "https://files.pythonhosted.org/packages/88/06/e4a2e49255ea23fa4feeb5ab092d90240d927c15e47b5b5c48dff5a9ce29/aiohttp-3.13.5-cp311-cp311-win32.whl", hash = "sha256:77dfa48c9f8013271011e51c00f8ada19851f013cde2c48fca1ba5e0caf5bb06", size = 439069, upload-time = "2026-03-31T21:57:32.388Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/43/8c7163a596dab4f8be12c190cf467a1e07e4734cf90eebb39f7f5d53fc6a/aiohttp-3.13.5-cp311-cp311-win_amd64.whl", hash = "sha256:d3a4834f221061624b8887090637db9ad4f61752001eae37d56c52fddade2dc8", size = 462859, upload-time = "2026-03-31T21:57:34.455Z" },
     { url = "https://files.pythonhosted.org/packages/be/6f/353954c29e7dcce7cf00280a02c75f30e133c00793c7a2ed3776d7b2f426/aiohttp-3.13.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:023ecba036ddd840b0b19bf195bfae970083fd7024ce1ac22e9bba90464620e9", size = 748876, upload-time = "2026-03-31T21:57:36.319Z" },
     { url = "https://files.pythonhosted.org/packages/f5/1b/428a7c64687b3b2e9cd293186695affc0e1e54a445d0361743b231f11066/aiohttp-3.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15c933ad7920b7d9a20de151efcd05a6e38302cbf0e10c9b2acb9a42210a2416", size = 499557, upload-time = "2026-03-31T21:57:38.236Z" },
     { url = "https://files.pythonhosted.org/packages/29/47/7be41556bfbb6917069d6a6634bb7dd5e163ba445b783a90d40f5ac7e3a7/aiohttp-3.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab2899f9fa2f9f741896ebb6fa07c4c883bfa5c7f2ddd8cf2aafa86fa981b2d2", size = 500258, upload-time = "2026-03-31T21:57:39.923Z" },
@@ -303,6 +359,7 @@ name = "anyio"
 version = "4.9.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
     { name = "idna" },
     { name = "sniffio" },
     { name = "typing-extensions", marker = "python_full_version < '3.13'" },
@@ -314,36 +371,78 @@ wheels = [
 
 [[package]]
 name = "apache-tvm-ffi"
-version = "0.1.10"
+version = "0.1.9"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/17/b0/5114e30faffe3279a51a5f3b45dd1b7ce09af1246b62447b45a39a374e54/apache_tvm_ffi-0.1.10.tar.gz", hash = "sha256:974c208766c304c780c17c6d405449e862f83b22c7b6b2b8c28b29d55a806ae3", size = 2691605, upload-time = "2026-04-07T19:58:51.767Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/1a/12/0ba672dba52f9ecc813ce7ff4ef4aa5a2c5f27243d26165f09053f057a76/apache_tvm_ffi-0.1.10-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:52ed8fec82451c3af1e205f55500e5adc5eaa1913c82ce15b2064d305d7f880b", size = 2285850, upload-time = "2026-04-07T19:58:12.784Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/2a/1978a1c827e1212de4f369ec08cfeb44719bbe6cbeab90b15e967c68c108/apache_tvm_ffi-0.1.10-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ec5c4a81e294e6379e4dea68c86266924d3f22829c3de272806c980238e43e59", size = 2476596, upload-time = "2026-04-07T19:58:14.316Z" },
-    { url = "https://files.pythonhosted.org/packages/50/6f/23740f06829030704e6f8f1f7093a06b7a68f904baa40053a5f594705bae/apache_tvm_ffi-0.1.10-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:73d478395a8625dd92fde7b7fd92b4719f18f480b78336e422cb66cc7985213d", size = 2589574, upload-time = "2026-04-07T19:58:15.94Z" },
-    { url = "https://files.pythonhosted.org/packages/92/d0/54badf5c8f6208e06f331a20ddd154f19c94c2e906da5b8cce7d60727d4b/apache_tvm_ffi-0.1.10-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3829216a8500c2f61062e48c627f6db6c3fa49416b3ffa85bc04243ae5d759f7", size = 2396434, upload-time = "2026-04-07T19:58:17.519Z" },
-    { url = "https://files.pythonhosted.org/packages/51/f7/ca3fdadc2468e8b67a2f3f13bb7aa132c584feefd8a25dbf920e4bf0a03b/apache_tvm_ffi-0.1.10-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:96b69030c722572e13e30182733adfa2d604258e988b3f6630a16f397c7f9288", size = 2571084, upload-time = "2026-04-07T19:58:20.399Z" },
-    { url = "https://files.pythonhosted.org/packages/23/2d/bf899e1ba4ea1da6a55a04ad3e9c07338ee06a140862b05310bae9a00cf9/apache_tvm_ffi-0.1.10-cp312-abi3-win_amd64.whl", hash = "sha256:14e59f6f69881d37a25b03943cfac33317a06f6745df0ff2dfb3b0cd3ed3698f", size = 2261853, upload-time = "2026-04-07T19:58:21.772Z" },
-    { url = "https://files.pythonhosted.org/packages/e6/ec/305fe5cc45d41a24d8d7236b886cacc2d6dd3c29eab68dc5cec06a9fd22c/apache_tvm_ffi-0.1.10-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:40c7caddf7b73cabf06f814e8d1bdef0f9bd5676bf7563546dd61f14df9e656d", size = 2344135, upload-time = "2026-04-07T19:58:23.512Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/5d/b1661512164772fc9ef1642234bf117182b440fc0a0b2ca8bd829fe7b40e/apache_tvm_ffi-0.1.10-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:32b9f4a44c09fcdd0994ee3c4415bf0371d68ea35a46da94ddcc666c9a6cf677", size = 2508518, upload-time = "2026-04-07T19:58:25.3Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/57/7266807b34344b9d8e4d776ebff38fd25f93a73e8c24bc595a67b6b69b3c/apache_tvm_ffi-0.1.10-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c9b93dc7fdc99d4cc44e9ac95063073b4fb8ced94929197ea3d631b70f554d8a", size = 2617108, upload-time = "2026-04-07T19:58:26.888Z" },
-    { url = "https://files.pythonhosted.org/packages/96/c3/a152ed68f57a491baaf70819224b98643309c7488fdcbc6fa3c84ebb9ca8/apache_tvm_ffi-0.1.10-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:74724db54dfb825951e2deb3d2024b2c1867bff456db81512e475f9ccdd9b86b", size = 2432434, upload-time = "2026-04-07T19:58:28.681Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/09/5e2877c635edc8ac83caa106a6e78bd4816cbc2e52e1daea652c1fe956cf/apache_tvm_ffi-0.1.10-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ac03c04145d9c248992e6f2ec2392a6914966a416eeeeaa729393f40b047be42", size = 2602517, upload-time = "2026-04-07T19:58:30.35Z" },
-    { url = "https://files.pythonhosted.org/packages/81/50/900d55d8c3ca5a3fcdcef3a6d999f316d01f9e45e5297c444a2940eff5d2/apache_tvm_ffi-0.1.10-cp314-cp314t-win_amd64.whl", hash = "sha256:25d9130788f9b4563330122503b21e6c0ed37198f1552df36c1561b3704f1b2f", size = 2370990, upload-time = "2026-04-07T19:58:31.855Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/6f/60/1e787a0b5ebf318483235be2a689ee367173983067e441b8379564f667c0/apache_tvm_ffi-0.1.9.tar.gz", hash = "sha256:d2d402587e8906de0a07f4746aa78f3d452c7efe3625d4bb39ac2ad693bce530", size = 2513731, upload-time = "2026-02-27T19:28:06.602Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/56/3d/4594c14de64e92697a91eec8ac6518ad72a3f30776aff432e68c2c6d9d3d/apache_tvm_ffi-0.1.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d911cbbc83bf12a0d9ec03e5315ff1bb92d95702fe912cd7a050393274382e71", size = 2068752, upload-time = "2026-02-27T19:27:03.001Z" },
+    { url = "https://files.pythonhosted.org/packages/83/0a/827e4f9ae85e1be3037818abd59566d906ba1fe27295c6938b12cc482151/apache_tvm_ffi-0.1.9-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:1c8dd4018420c0d14bace688594710909ce198056ff8ac2ad1cd462b30fe1bdd", size = 2231204, upload-time = "2026-02-27T19:27:04.734Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/b6/f1ec5c528918c4dae03885ec472663072a984431d7d7fb04ca0798a2e13c/apache_tvm_ffi-0.1.9-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7f6bc8846d570b8ce38692fc91b530b44cd6ae092c805a844da23970e81b12c0", size = 2323684, upload-time = "2026-02-27T19:27:06.284Z" },
+    { url = "https://files.pythonhosted.org/packages/28/08/818836fbc0f198da1597896f82d7e6556bf5678cd5150d633214bf14b718/apache_tvm_ffi-0.1.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f3ec9149f207a7af3ea3531cad7a0b0d04ded06df4f51a547479d5eb489428dd", size = 2160066, upload-time = "2026-02-27T19:27:07.897Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/6b/2e7d73d055523c2fb31394cd3d55593969a0680619e1c939c2128c2fdd36/apache_tvm_ffi-0.1.9-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:eefcd17f61bf503ff0f4ad429e03ef6c241c7d13682f58281d883218b854c9bd", size = 2307014, upload-time = "2026-02-27T19:27:10.287Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/9d/9b99efbeaaed4c78a2b7cfeda6b8fc7d6249616938c05ae0248aa0bf0d56/apache_tvm_ffi-0.1.9-cp310-cp310-win_amd64.whl", hash = "sha256:dd58da01331826fbe6c064d6f0c9bbc2d62883b78df8d15baa8ea21d37507e4d", size = 1993158, upload-time = "2026-02-27T19:27:11.884Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/44/130571cede8704b1412e48b3dd78de41b4d31b68241f954743d1a9925bd9/apache_tvm_ffi-0.1.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:932d94e29595a47109f0ef6e0b4209a934451582954ea8b426e758d6b3e307e3", size = 2070368, upload-time = "2026-02-27T19:27:13.779Z" },
+    { url = "https://files.pythonhosted.org/packages/42/b1/9f2cfd6d49b03c5d4ec5c12548d911e2e01265be783f343103b4df716765/apache_tvm_ffi-0.1.9-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c0449fc3802987c3652bea266ffda2934a6f69c80bba791a3f55b91040656a18", size = 2231154, upload-time = "2026-02-27T19:27:15.691Z" },
+    { url = "https://files.pythonhosted.org/packages/55/43/63faedea83494e99122466a993bcdccd31cf93c7e8a0d56731120e82e2b9/apache_tvm_ffi-0.1.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6f16d73a82a9e68a439b7d233d48b1b929be17fe92df4bbf1ee2274e573144a3", size = 2323130, upload-time = "2026-02-27T19:27:17.259Z" },
+    { url = "https://files.pythonhosted.org/packages/27/96/d735bc4c528efaf0a8a954076963c727aad2dde8577641aa9025ec4f2d52/apache_tvm_ffi-0.1.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:01ebb1308b2666c206aa9a4015eb48f03a5d98ea2e9cfb002bd5e2ca0b9c7ef3", size = 2159854, upload-time = "2026-02-27T19:27:18.789Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/3b/6cfc82a3ab5d9e501bbcee5df36eebe09da1c384461d7a55e2a17776d117/apache_tvm_ffi-0.1.9-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21365abd2a2a1a6d3b4e6e4f048309651125becfa795440c3607f3cc27d30ac7", size = 2307140, upload-time = "2026-02-27T19:27:20.222Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/61/3ffe1fe3190e12807a12b72ed0d291c7f66569c2e7c3571fde18175f19e1/apache_tvm_ffi-0.1.9-cp311-cp311-win_amd64.whl", hash = "sha256:9ee710a9fba3d9ff9747870bbd7e2175eb8d5b9c791f17fd645f35f6dab3f8aa", size = 1993218, upload-time = "2026-02-27T19:27:22.043Z" },
+    { url = "https://files.pythonhosted.org/packages/df/f2/b8c4b151169f6d7ba8773c8af68b2e0c1013d7fb3f1bdf87573f47157ce9/apache_tvm_ffi-0.1.9-cp312-abi3-macosx_11_0_arm64.whl", hash = "sha256:49e52350b0470654847de752e65603b604a4d3323e7e9f5e8a982f44acc4c143", size = 2041756, upload-time = "2026-02-27T19:27:23.931Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/c0/6d3d54f50012255b41bc3e24944c086f63c4707c8686c7c6780e9283eb96/apache_tvm_ffi-0.1.9-cp312-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7d503029e66c43b1a1cb1a42a1e9bb428c8a28dcbdec31c28e705472ca648a3a", size = 2203712, upload-time = "2026-02-27T19:27:25.867Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/dd/2bab4c6cd86257dbf99e93452a1af833113f8dc3e25a25579f6e4e4c8a94/apache_tvm_ffi-0.1.9-cp312-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28241371934ea8af10d5067087ba1229ebddded7b2c02d33a258ec2a96df8c46", size = 2299704, upload-time = "2026-02-27T19:27:27.477Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/4a/b469bcb2e1014cb84d336d2a59f42958a058251c577a4c2680cacad346e2/apache_tvm_ffi-0.1.9-cp312-abi3-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:87cacce81df55685fc6a76e1e3c5db1200e85e87bf5974b692c59d131b7bc622", size = 2130865, upload-time = "2026-02-27T19:27:29.092Z" },
+    { url = "https://files.pythonhosted.org/packages/70/ef/5402da5d37f5270fd88ea0348acca78dba9be8bdbf6c2bcae0935eb03ef1/apache_tvm_ffi-0.1.9-cp312-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f45eb43499acac45ff6c93564f0ff2d3ca27b69656d540fd56ce59d51c0b4c65", size = 2278991, upload-time = "2026-02-27T19:27:30.729Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/23/1b7dc5f0807f83098183a57db6ee85b2c93b646d74a6e03781c9208aaeb0/apache_tvm_ffi-0.1.9-cp312-abi3-win_amd64.whl", hash = "sha256:d1dcf4c041d5ec05e3da1d545800c33cdbb95c113baa7705085ff79fa262752b", size = 1973200, upload-time = "2026-02-27T19:27:32.367Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/1e/991ae65e64ce132c1ba665562db6638f5696d6133f580e20c653de33b9af/apache_tvm_ffi-0.1.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c3349f72ddb8ce206472d0380a729f213017a2180707096f8d57114b81097dd1", size = 2072944, upload-time = "2026-02-27T19:27:34.261Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/a7/1e0643949e683fb3cfababd87058c0cfef122d1a3bb6ce703f719051b842/apache_tvm_ffi-0.1.9-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d1f4d2b7ec7b1213632e9a104e9330bfc3dec48decffa62114c33aa188c9f43a", size = 2215954, upload-time = "2026-02-27T19:27:35.872Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/06/5016191ab61d2db4c3a7d754a3c1184e0836f575a7d08491669738c5e4b9/apache_tvm_ffi-0.1.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e4f01d16ba53fe118e363f7257253f07003797e4abe6fc9567f23b6a930dbff2", size = 2307291, upload-time = "2026-02-27T19:27:37.527Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/f5/40bf0667330938efbfc0a51743cc53c79e41b4ece1a8abad3076192c9674/apache_tvm_ffi-0.1.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3c0581dd6bfbce7b017ef85cfda08bbe38891cc4b3afbcfaa8bc2d383728e426", size = 2143850, upload-time = "2026-02-27T19:27:40.437Z" },
+    { url = "https://files.pythonhosted.org/packages/72/4a/421cbd4ed32e8bad3b88af3e8fa145c1f6f493bdd05be15b6f2d9b3cb7d6/apache_tvm_ffi-0.1.9-cp314-cp314t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7dfa14be2a49347791ef21222a8225ce7f99bfec17104a676cb4f1bf3a107088", size = 2289038, upload-time = "2026-02-27T19:27:41.972Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/1a/c8923d819b49872a612033b90d29299c0be73a7cbed1ddb3dc78dfe5e9f1/apache_tvm_ffi-0.1.9-cp314-cp314t-win_amd64.whl", hash = "sha256:a42d7ca27dce83efbdce7ec970fe3e773a69c31d928730ee5d9badb1229d106c", size = 2039007, upload-time = "2026-02-27T19:27:43.618Z" },
 ]
 
 [[package]]
 name = "astroid"
 version = "3.2.4"
 source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
 sdist = { url = "https://files.pythonhosted.org/packages/9e/53/1067e1113ecaf58312357f2cd93063674924119d80d173adc3f6f2387aa2/astroid-3.2.4.tar.gz", hash = "sha256:0e14202810b30da1b735827f78f5157be2bbd4a7a59b7707ca0bfc2fb4c0063a", size = 397576, upload-time = "2024-07-20T12:57:43.26Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/80/96/b32bbbb46170a1c8b8b1f28c794202e25cfe743565e9d3469b8eb1e0cc05/astroid-3.2.4-py3-none-any.whl", hash = "sha256:413658a61eeca6202a59231abb473f932038fbcbf1666587f66d482083413a25", size = 276348, upload-time = "2024-07-20T12:57:40.886Z" },
 ]
 
+[[package]]
+name = "async-timeout"
+version = "4.0.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/87/d6/21b30a550dafea84b1b8eee21b5e23fa16d010ae006011221f33dcd8d7f8/async-timeout-4.0.3.tar.gz", hash = "sha256:4640d96be84d82d02ed59ea2b7105a0f7b33abe8703703cd0ab0bf87c427522f", size = 8345, upload-time = "2023-08-10T16:35:56.907Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/fa/e01228c2938de91d47b307831c62ab9e4001e747789d0b05baf779a6488c/async_timeout-4.0.3-py3-none-any.whl", hash = "sha256:7405140ff1230c310e51dc27b3145b9092d659ce68ff733fb0cefe3ee42be028", size = 5721, upload-time = "2023-08-10T16:35:55.203Z" },
+]
+
+[[package]]
+name = "async-timeout"
+version = "5.0.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a5/ae/136395dfbfe00dfc94da3f3e136d0b13f394cba8f4841120e34226265780/async_timeout-5.0.1.tar.gz", hash = "sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3", size = 9274, upload-time = "2024-11-06T16:41:39.6Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fe/ba/e2081de779ca30d473f21f5b30e0e737c438205440784c7dfc81efc2b029/async_timeout-5.0.1-py3-none-any.whl", hash = "sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c", size = 6233, upload-time = "2024-11-06T16:41:37.9Z" },
+]
+
 [[package]]
 name = "attrs"
 version = "26.1.0"
@@ -355,26 +454,33 @@ wheels = [
 
 [[package]]
 name = "av"
-version = "17.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/4e/f0/8c8dca97ae0cf00e8e2a53bb5cb9aca5fd484f585ef3e9b412200aff3ebd/av-17.0.1.tar.gz", hash = "sha256:fbcbd4aa43bca6a8691816283112d1659a27f407bbeb66d1397023691339f5d4", size = 4411938, upload-time = "2026-04-18T17:12:34.29Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/4c/82/e7007dcef7bd2d2c377e2e85977701384f42d19fc808c2ccb3a99eaf58f2/av-17.0.1-cp311-abi3-macosx_11_0_x86_64.whl", hash = "sha256:987f4f46ceae4da6c614dcbd2b8149be9dbf680c3bb7a6841c58af9cff4d9230", size = 23238802, upload-time = "2026-04-18T17:11:51.166Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/aa/858b09a08ea6f83f91be44b5a5adad13ae8d9ac8b80fda27e73c24bfb160/av-17.0.1-cp311-abi3-macosx_14_0_arm64.whl", hash = "sha256:d97f54e55b18a74912f479c1978aadd1341d38d892dee95bb5c2f2dccfa72f32", size = 18709338, upload-time = "2026-04-18T17:11:53.286Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/8b/8de3fd21c4b0b74d44337421abeab0e71462337fb6a28fff888e0c356cbd/av-17.0.1-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:e6eee84afa48d0e9321047cd3e4facd44b401493f6bdc753e2e1d1e7c9e6d13e", size = 34007351, upload-time = "2026-04-18T17:11:56.116Z" },
-    { url = "https://files.pythonhosted.org/packages/02/28/167b291356c2cc315a2d62a95b0ceace72b5b0bf547de30b89313110f032/av-17.0.1-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:c58c71bffd9383908c85695ac61d3184c668accb04a5bd1b262e0fb8d09f60a5", size = 36345295, upload-time = "2026-04-18T17:11:59.125Z" },
-    { url = "https://files.pythonhosted.org/packages/04/fa/aae56f2ff2c204c408641e1120f5ca5ce9c3390cf5362245c6f1158704b5/av-17.0.1-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:42d6745d30a410ec9b22aef79a52a7ab5a001eb8f5adfd952946606a30983318", size = 35183754, upload-time = "2026-04-18T17:12:01.697Z" },
-    { url = "https://files.pythonhosted.org/packages/ba/bd/776046f27093aef80155a204ca7d82a887ae4ee72ba4ef8411b46ea7898c/av-17.0.1-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:3ed6bcd7021fe55832f95b8ef78dd01a4cb21faf3cd71f1e1bf4f20bf100b278", size = 37430809, upload-time = "2026-04-18T17:12:04.231Z" },
-    { url = "https://files.pythonhosted.org/packages/d9/d5/3261bd2c6b7f6c0aa8379fc970d1ecf496330990b992ad28607785074268/av-17.0.1-cp311-abi3-win_amd64.whl", hash = "sha256:9af524e8632a54032e361d6b88895bd3e7c6212ca560de60f5ccc525323c764c", size = 28889649, upload-time = "2026-04-18T17:12:07.04Z" },
-    { url = "https://files.pythonhosted.org/packages/98/39/381104e427a0c7231d2ec0d25d538d58fc20fc0458846b95860d3ef8073b/av-17.0.1-cp311-abi3-win_arm64.whl", hash = "sha256:50e58a473d65ea29b645e45c9fd8518a6783737135683ecc40571a91592bdfe4", size = 21918412, upload-time = "2026-04-18T17:12:09.312Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/8c/bb1498f031abb6157b30b7fc2379359176953821b6ba59fbd89dbb56f61f/av-17.0.1-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:1d33871742d1e71562db3c8e752cacc5a62766d7efc3ae408bff1c3e26ebb46e", size = 23484157, upload-time = "2026-04-18T17:12:11.67Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/58/dedaef187b797243cd5762722e376c69c5ad95ab23db44127f09afc2cd66/av-17.0.1-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:1229e879f4b6431bc00f69d7f8891fe9a683b0a6e0e009e6c98eb7e449f0383d", size = 18920872, upload-time = "2026-04-18T17:12:14.826Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/26/5c550231651d6285e6a5c4f6f4a0e67459bfe2b622a7c9352be8cca8c819/av-17.0.1-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:4744837f4116964280bcc72285e3cdd51361e98a696205aadd924203440ef511", size = 37471077, upload-time = "2026-04-18T17:12:17.349Z" },
-    { url = "https://files.pythonhosted.org/packages/59/e4/9807b89a9d775c6f015677996c48bce48aaff70b5d95885adf39e59832a2/av-17.0.1-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:3d0a7d45d9599bf9df9f8249827113d4f36df1cd6b5356227b997f0552dbc98e", size = 39566981, upload-time = "2026-04-18T17:12:19.942Z" },
-    { url = "https://files.pythonhosted.org/packages/5c/72/a22a657abc3de652f5b4f46cbbebdf7cba629752112791b81f05d340991d/av-17.0.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9acd0b6a6e02af2b37f63d97a03ee2c47936d58e82425c3cd075a95245937c59", size = 38397369, upload-time = "2026-04-18T17:12:22.909Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/b2/f4e83e41c1e3c186f34b7df506779d0cd7e40499e2e19519c7ece148cd20/av-17.0.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3d3a36204cb1f1e7691e6446afa8d6b7097b09946dae732c71c5d05ce09e506e", size = 40582445, upload-time = "2026-04-18T17:12:26.285Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/59/8676188b72eed09d48ce6cfaf0f22b0bb9f3cfd74d388ee2b7fdf960536d/av-17.0.1-cp314-cp314t-win_amd64.whl", hash = "sha256:b87b98afe971cde123953073bc9c95ab0b7efd2ecc082dd2dbd11f9d9abf190e", size = 29217136, upload-time = "2026-04-18T17:12:29.189Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/af/0a6e1d2a845988039f6c197fa7269b5e9abbe17354fb41cc9d75bb260fcb/av-17.0.1-cp314-cp314t-win_arm64.whl", hash = "sha256:a87a42c36e29f75e7dff7281944f2a6876a2c8875e225ccbf6c1ae62748b4caa", size = 22072676, upload-time = "2026-04-18T17:12:31.836Z" },
+version = "17.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/b2/eb/abca886df3a091bc406feb5ff71b4c4f426beaae6b71b9697264ce8c7211/av-17.0.0.tar.gz", hash = "sha256:c53685df73775a8763c375c7b2d62a6cb149d992a26a4b098204da42ade8c3df", size = 4410769, upload-time = "2026-03-14T14:38:45.868Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/95/4d/ea1ac272eeea83014daca1783679a9e9f894e1e68e5eb4f717dd8813da2a/av-17.0.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:4b21bcff4144acae658c0efb011fa8668c7a9638384f3ae7f5add33f35b907c6", size = 23407827, upload-time = "2026-03-14T14:37:47.337Z" },
+    { url = "https://files.pythonhosted.org/packages/54/1a/e433766470c57c9c1c8558021de4d2466b3403ed629e48722d39d12baa6c/av-17.0.0-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:17cd518fc88dc449ce9dcfd0b40e9b3530266927375a743efc80d510adfb188b", size = 18829899, upload-time = "2026-03-14T14:37:50.493Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/25/95ad714f950c188495ffbfef235d06a332123d6f266026a534801ffc2171/av-17.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:9a8b7b63a92d8dc7cbe5000546e4684176124ddd49fdd9c12570e3aa6dadf11a", size = 35348062, upload-time = "2026-03-14T14:37:52.964Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/db/7f3f9e92f2ac8dba639ab01d69a33b723aa16b5e3e612dbfe667fbc02dcd/av-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:8706ce9b5d8d087d093b46a9781e7532c4a9e13874bca1da468be78efc56cecc", size = 37684503, upload-time = "2026-03-14T14:37:55.628Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/53/3b356b14ba72354688c8d9777cf67b707769b6e14b63aaeb0cddeeac8d32/av-17.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3a074835ce807434451086993fedfb3b223dacedb2119ab9d7a72480f2d77f32", size = 36547601, upload-time = "2026-03-14T14:37:58.465Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/8d/f489cd6f9fe9c8b38dca00ecb39dc38836761767a4ec07dd95e62e124ac3/av-17.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f8ef8e8f1a0cbb2e0ad49266015e2277801a916e2186ac9451b493ff6dfdec27", size = 38815129, upload-time = "2026-03-14T14:38:01.277Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/bd/e42536234e37caffd1a054de1a0e6abca226c5686e9672726a8d95511422/av-17.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:a795e153ff31a6430e974b4e6ad0d0fab695b78e3f17812293a0a34cd03ee6a9", size = 28984602, upload-time = "2026-03-14T14:38:03.632Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/fb/55e3b5b5d1fc61466292f26fbcbabafa2642f378dc48875f8f554591e1a4/av-17.0.0-cp311-abi3-macosx_11_0_x86_64.whl", hash = "sha256:ed4013fac77c309a4a68141dcf6148f1821bb1073a36d4289379762a6372f711", size = 23238424, upload-time = "2026-03-14T14:38:05.856Z" },
+    { url = "https://files.pythonhosted.org/packages/52/03/9ace1acc08bc9ae38c14bf3a4b1360e995e4d999d1d33c2cbd7c9e77582a/av-17.0.0-cp311-abi3-macosx_14_0_arm64.whl", hash = "sha256:e44b6c83e9f3be9f79ee87d0b77a27cea9a9cd67bd630362c86b7e56a748dfbb", size = 18709043, upload-time = "2026-03-14T14:38:08.288Z" },
+    { url = "https://files.pythonhosted.org/packages/00/c0/637721f3cd5bb8bd16105a1a08efd781fc12f449931bdb3a4d0cfd63fa55/av-17.0.0-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:b440da6ac47da0629d509316f24bcd858f33158dbdd0f1b7293d71e99beb26de", size = 34018780, upload-time = "2026-03-14T14:38:10.45Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/59/d19bc3257dd985d55337d7f0414c019414b97e16cd3690ebf9941a847543/av-17.0.0-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1060cba85f97f4a337311169d92c0b5e143452cfa5ca0e65fa499d7955e8592e", size = 36358757, upload-time = "2026-03-14T14:38:13.092Z" },
+    { url = "https://files.pythonhosted.org/packages/52/6c/a1f4f2677bae6f2ade7a8a18e90ebdcf70690c9b1c4e40e118aa30fa313f/av-17.0.0-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:deda202e6021cfc7ba3e816897760ec5431309d59a4da1f75df3c0e9413d71e7", size = 35195281, upload-time = "2026-03-14T14:38:15.789Z" },
+    { url = "https://files.pythonhosted.org/packages/90/ea/52b0fc6f69432c7bf3f5fbe6f707113650aa40a1a05b9096ffc2bba4f77d/av-17.0.0-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:ffaf266a1a9c2148072de0a4b5ae98061465178d2cfaa69ee089761149342974", size = 37444817, upload-time = "2026-03-14T14:38:18.563Z" },
+    { url = "https://files.pythonhosted.org/packages/34/ad/d2172966282cb8f146c13b6be7416efefde74186460c5e1708ddfc13dba6/av-17.0.0-cp311-abi3-win_amd64.whl", hash = "sha256:45a35a40b2875bf2f98de7c952d74d960f92f319734e6d28e03b4c62a49e6f49", size = 28888553, upload-time = "2026-03-14T14:38:21.223Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/bb/c5a4c4172c514d631fb506e6366b503576b8c7f29809cf42aca73e28ff01/av-17.0.0-cp311-abi3-win_arm64.whl", hash = "sha256:3d32e9b5c5bbcb872a0b6917b352a1db8a42142237826c9b49a36d5dbd9e9c26", size = 21916910, upload-time = "2026-03-14T14:38:23.706Z" },
+    { url = "https://files.pythonhosted.org/packages/7f/8e/c40ac08e63f79387c59f6ecc38f47d4c942b549130eee579ec1a91f6a291/av-17.0.0-cp314-cp314t-macosx_11_0_x86_64.whl", hash = "sha256:d13250fb4b4522e9a6bec32da082556d5f257110ea223758151375748d9bbe25", size = 23483029, upload-time = "2026-03-14T14:38:25.758Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/fb/b4419494bfc249163ec393c613966d66db7e95c76da3345711cd115a79df/av-17.0.0-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:dbb56aa3b7ae72451d1bf6e9d37c7d83d39b97af712f73583ff419fbf08fc237", size = 18920446, upload-time = "2026-03-14T14:38:27.905Z" },
+    { url = "https://files.pythonhosted.org/packages/30/62/c2306d91602ddad2c56106f21dcb334fd51d5ea2e952f7fa025bb8aa39fc/av-17.0.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:a213ac9e83b7ab12c2e9f277a09cac8e9d85cf0883efdab7a87a60e2e4e48879", size = 37477266, upload-time = "2026-03-14T14:38:30.404Z" },
+    { url = "https://files.pythonhosted.org/packages/28/cd/c8510a9607886785c0b3ca019d503e888c3757529be42a7287fe2bfa92d5/av-17.0.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:e15c88bb0921f9435bcc5a27a0863dba571a80ad5e1389c4fcf2073833bb4a74", size = 39572988, upload-time = "2026-03-14T14:38:32.984Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/2d/207d9361e25b5abec9be335bbab4df6b6b838e2214be4b374f4cfb285427/av-17.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:096cfd1e9fc896506726c7c42aaf9b370e78c2f257cde4d6ddb6c889bfcc49ec", size = 38399591, upload-time = "2026-03-14T14:38:35.465Z" },
+    { url = "https://files.pythonhosted.org/packages/73/ca/307740c6aa2980966bf11383ffcb04bacc5b13f3d268ab4cfb274ad6f793/av-17.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3649ab3d2c7f58049ded1a36e100c0d8fd529cf258f41dd88678ba824034d8c9", size = 40590681, upload-time = "2026-03-14T14:38:38.269Z" },
+    { url = "https://files.pythonhosted.org/packages/35/f2/6fdb26d0651adf409864cb2a0d60da107e467d3d1aabc94b234ead54324a/av-17.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:e5002271ab2135b551d980c2db8f3299d452e3b9d3633f24f6bb57fffe91cd10", size = 29216337, upload-time = "2026-03-14T14:38:40.83Z" },
+    { url = "https://files.pythonhosted.org/packages/41/0a/0896b829a39b5669a2d811e1a79598de661693685cd62b31f11d0c18e65b/av-17.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dba98603fc4665b4f750de86fbaf6c0cfaece970671a9b529e0e3d1711e8367e", size = 22071058, upload-time = "2026-03-14T14:38:43.663Z" },
 ]
 
 [[package]]
@@ -386,6 +492,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/77/f5/21d2de20e8b8b0408f0681956ca2c69f1320a3848ac50e6e7f39c6159675/babel-2.18.0-py3-none-any.whl", hash = "sha256:e2b422b277c2b9a9630c1d7903c2a00d0830c409c59ac8cae9081c92f1aeba35", size = 10196845, upload-time = "2026-02-01T12:30:53.445Z" },
 ]
 
+[[package]]
+name = "backports-asyncio-runner"
+version = "1.2.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/8e/ff/70dca7d7cb1cbc0edb2c6cc0c38b65cba36cccc491eca64cabd5fe7f8670/backports_asyncio_runner-1.2.0.tar.gz", hash = "sha256:a5aa7b2b7d8f8bfcaa2b57313f70792df84e32a2a746f585213373f900b42162", size = 69893, upload-time = "2025-07-02T02:27:15.685Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/a0/59/76ab57e3fe74484f48a53f8e337171b4a2349e506eabe136d7e01d059086/backports_asyncio_runner-1.2.0-py3-none-any.whl", hash = "sha256:0da0a936a8aeb554eccb426dc55af3ba63bcdc69fa1a600b5bb305413a4477b5", size = 12313, upload-time = "2025-07-02T02:27:14.263Z" },
+]
+
 [[package]]
 name = "bcrypt"
 version = "5.0.0"
@@ -450,6 +565,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c0/f6/688d2cd64bfd0b14d805ddb8a565e11ca1fb0fd6817175d58b10052b6d88/bcrypt-5.0.0-cp39-abi3-win32.whl", hash = "sha256:64d7ce196203e468c457c37ec22390f1a61c85c6f0b8160fd752940ccfb3a683", size = 153725, upload-time = "2025-09-25T19:50:34.384Z" },
     { url = "https://files.pythonhosted.org/packages/9f/b9/9d9a641194a730bda138b3dfe53f584d61c58cd5230e37566e83ec2ffa0d/bcrypt-5.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:64ee8434b0da054d830fa8e89e1c8bf30061d539044a39524ff7dec90481e5c2", size = 150912, upload-time = "2025-09-25T19:50:35.69Z" },
     { url = "https://files.pythonhosted.org/packages/27/44/d2ef5e87509158ad2187f4dd0852df80695bb1ee0cfe0a684727b01a69e0/bcrypt-5.0.0-cp39-abi3-win_arm64.whl", hash = "sha256:f2347d3534e76bf50bca5500989d6c1d05ed64b440408057a37673282c654927", size = 144953, upload-time = "2025-09-25T19:50:37.32Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/75/4aa9f5a4d40d762892066ba1046000b329c7cd58e888a6db878019b282dc/bcrypt-5.0.0-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7edda91d5ab52b15636d9c30da87d2cc84f426c72b9dba7a9b4fe142ba11f534", size = 271180, upload-time = "2025-09-25T19:50:38.575Z" },
+    { url = "https://files.pythonhosted.org/packages/54/79/875f9558179573d40a9cc743038ac2bf67dfb79cecb1e8b5d70e88c94c3d/bcrypt-5.0.0-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:046ad6db88edb3c5ece4369af997938fb1c19d6a699b9c1b27b0db432faae4c4", size = 273791, upload-time = "2025-09-25T19:50:39.913Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/fe/975adb8c216174bf70fc17535f75e85ac06ed5252ea077be10d9cff5ce24/bcrypt-5.0.0-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:dcd58e2b3a908b5ecc9b9df2f0085592506ac2d5110786018ee5e160f28e0911", size = 270746, upload-time = "2025-09-25T19:50:43.306Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/f8/972c96f5a2b6c4b3deca57009d93e946bbdbe2241dca9806d502f29dd3ee/bcrypt-5.0.0-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:6b8f520b61e8781efee73cba14e3e8c9556ccfb375623f4f97429544734545b4", size = 273375, upload-time = "2025-09-25T19:50:45.43Z" },
 ]
 
 [[package]]
@@ -471,6 +590,32 @@ version = "3.8.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/fc/47/b5da717e7bbe97a6dc4c986f053ca55fd3276078d78f68f9e8b417d1425a/bitarray-3.8.1.tar.gz", hash = "sha256:f90bb3c680804ec9630bcf8c0965e54b4de84d33b17d7da57c87c30f0c64c6f5", size = 152471, upload-time = "2026-04-02T16:29:01.712Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/ff/fc/4352b1dd55a50c85f7b502c011d40279a66a05eb0c6a5d3d44160838d9a4/bitarray-3.8.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:30d42c34da2974a5e2e0b51c57ecf89892c1e83ed67e1084d1e27eefc27add91", size = 149074, upload-time = "2026-04-02T16:26:16.319Z" },
+    { url = "https://files.pythonhosted.org/packages/34/06/104c9ff50e5230f6581056d6f4b0d1e0db14aba41549cae4b0541be0369c/bitarray-3.8.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:0793c51d3b1c7410bde1f7254fff71fabff1bc0cdeba1fa51319ac4e7931df3d", size = 146031, upload-time = "2026-04-02T16:26:18.453Z" },
+    { url = "https://files.pythonhosted.org/packages/65/0b/99d65fa6ceb3616c4b96ab9fef2dcd4994ad05fa48f595706ba001f13ba7/bitarray-3.8.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:133648c3405564e7fef9103f1768cb018de1b4976f3d8beff09cd4acea73bfe4", size = 325129, upload-time = "2026-04-02T16:26:19.77Z" },
+    { url = "https://files.pythonhosted.org/packages/18/d4/e0913c6b15fbd1e6b4d60a541a6784eb5d8f1fddcbcbb8c076240f665f2e/bitarray-3.8.1-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c4fd3399eaf6f1c77ea3132611efbc3d7a8c0eb899793387b3266be221dc75fd", size = 353126, upload-time = "2026-04-02T16:26:21.274Z" },
+    { url = "https://files.pythonhosted.org/packages/49/1b/0fd86dece4eca8078a99e54ca01183d5b660195dea8f2c8ad5740b190e9f/bitarray-3.8.1-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3b9790ae107fc8648155f120e80a58ef8e94424efefff5b355de84061de6a18b", size = 363588, upload-time = "2026-04-02T16:26:22.638Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/2d/e62ddf9e52a0124a19f2cd83be5dfa256c6c1f20722fb0bb4b0aed51bb0b/bitarray-3.8.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:af01133e78e5528ee282ceb1cf4bc54aecb937c2001913e751452ad7dffbbeb1", size = 331725, upload-time = "2026-04-02T16:26:24.296Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/fc/ea1c532169d56747c128f4e0256a3ad1f0c91ed00ca83cdf93964a60fec3/bitarray-3.8.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2da2ca9495668ab77132a911f6bd530d2bfe686d10467584894efc3b66e9ffb5", size = 322939, upload-time = "2026-04-02T16:26:25.904Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/01/8fe68993779f077c713fc4c21c0d9ba2719beeea596bcdc37f9660b6f181/bitarray-3.8.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:72a0e87b2196120523fc6194ca6b580fcffa12d7daa4d57a16d7838e60f82d0e", size = 351084, upload-time = "2026-04-02T16:26:27.396Z" },
+    { url = "https://files.pythonhosted.org/packages/96/c5/f5cb62b60e0da428ef9457e7e1d9a3d3d8874b4f0f925adfff4b9ab3a319/bitarray-3.8.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:defa3c12cb06b2fd2066a9e21bf00aab96465be84d9585c8c05195f080510506", size = 347489, upload-time = "2026-04-02T16:26:28.935Z" },
+    { url = "https://files.pythonhosted.org/packages/88/71/bb9baadbdd305f80def4220ce38266f53404433661492fc2c3d894129bfb/bitarray-3.8.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7eae9e763fbd32f19f2a66dfc2e37906f8422e0c4ad4a6c9dcf9d3246740812e", size = 328394, upload-time = "2026-04-02T16:26:30.585Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/1a/c4d487b029488bebef38a4c04df34294d27f313b5ab3491aa3a051394f24/bitarray-3.8.1-cp310-cp310-win32.whl", hash = "sha256:3b9358f6437a5fa0c765ffae5810c9830547baf4bcf469438b82845c3f33f998", size = 143245, upload-time = "2026-04-02T16:26:32.414Z" },
+    { url = "https://files.pythonhosted.org/packages/98/ae/adadedf7cdd49fb8d81b8013d3471c193d6208035a0748205c808b1709cd/bitarray-3.8.1-cp310-cp310-win_amd64.whl", hash = "sha256:6f92d12a46b2a67d56194bb5d226dabf586b386d1f1a5e25be5b745a3080dbba", size = 149976, upload-time = "2026-04-02T16:26:33.868Z" },
+    { url = "https://files.pythonhosted.org/packages/26/de/2a7a8c2868d85e671a6cdb5282bbb299d98cdc0b4c4ade0cfa9a2a21d91d/bitarray-3.8.1-cp310-cp310-win_arm64.whl", hash = "sha256:8e12d50d4d65c74bd877e15c276992263b878456a7cfcf72521e7205a553557f", size = 146729, upload-time = "2026-04-02T16:26:35.216Z" },
+    { url = "https://files.pythonhosted.org/packages/05/5c/32ace44d0313b4a9986d2abc3a1349744920dafcfb6a4e454a10ed09ef5a/bitarray-3.8.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:660e11b9932f58f10151d0febd11f77d3b0d48d6fa4dd4686d8983f40187101e", size = 149069, upload-time = "2026-04-02T16:26:36.671Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/85/7bd0a218478f0a226ddfb756dd64286f8ee3c61a17991a1a50aae8d89dca/bitarray-3.8.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fb1df55f5700187c6db4b47dbdaf8a0653a111341ac7fccc596b397aa3399e65", size = 146036, upload-time = "2026-04-02T16:26:38.179Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/e9/e4e6aec6874efac185959f4627b6a61a88c0dad3ec92eee433fd395daa78/bitarray-3.8.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:838fd67b3d00c5a64181073282a2c0bf8f76465da4844d5e79d2dbbc64c987dc", size = 333036, upload-time = "2026-04-02T16:26:39.723Z" },
+    { url = "https://files.pythonhosted.org/packages/50/5f/d493eb77f79b58eaa489e9e032aa1c91f6af844287b341c6be681df11b0d/bitarray-3.8.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5743f532e408cfd716fa16776b5a6447b83ff2cf39021fb5f8d052aa0f331508", size = 361247, upload-time = "2026-04-02T16:26:41.023Z" },
+    { url = "https://files.pythonhosted.org/packages/24/a3/2e3f33c66f61754b5bb4724d54c9c1122699facc580bcb416d44f1164ffc/bitarray-3.8.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0c8c66f5d8055cb84ad0ea14af57b3579cb0b6db589f2086f5e33f0922cf2354", size = 371922, upload-time = "2026-04-02T16:26:42.373Z" },
+    { url = "https://files.pythonhosted.org/packages/05/03/4dfca9a69dfa69cde6fdbcfafbc039e069e105ea2443688177f6873d8444/bitarray-3.8.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8c3fe25871f1758519a3ad8dcafb1bd95c5d1aaeb122e6492ac739ab11fa5907", size = 339203, upload-time = "2026-04-02T16:26:43.915Z" },
+    { url = "https://files.pythonhosted.org/packages/14/5d/a2275da6c935893f275624c88afab6cdd5b6aa916d0b45c50dd400cafb20/bitarray-3.8.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e9ff57452fcadfd1a379314234657b8f4e9967ae64480ddf7c2fd82139bc8cf8", size = 330956, upload-time = "2026-04-02T16:26:45.675Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/fd/7f4041c7a7e94ef3e7de86fdb4102d3fe366998b507de77ba0fe5dff6c44/bitarray-3.8.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:4e34f1cb6cdb036c5f4a839a2b74419f75fa36177a70c4bab2867f48973cbe44", size = 358882, upload-time = "2026-04-02T16:26:47.327Z" },
+    { url = "https://files.pythonhosted.org/packages/29/4e/2d0c381327c0f5bc49681b799bbe7d80d5e629079f9609a79d39da6e8b8f/bitarray-3.8.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:698c37fca3761af69a09a1d39cc0492f7e8cb9e263af39a288dce8f3b8a9e2bc", size = 355761, upload-time = "2026-04-02T16:26:48.665Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/d9/66644d45d9f844d1c78b80f3517c8717ac4b4d9853ec61bd02b3cabc06e6/bitarray-3.8.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:81ede1f094f26eeaff62e029ff1bc4e84e9d568f20d4669f64dcf7c7b18a28fc", size = 336422, upload-time = "2026-04-02T16:26:49.988Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/7d/4ea3fd2424535630d4d236bc0c721621260b39878eed669dbc1deb5c6b22/bitarray-3.8.1-cp311-cp311-win32.whl", hash = "sha256:8a345b5dc8ab8cafdf338e08530d48fe3f73df27f4ff569be793c7a7e7bb6b6b", size = 143391, upload-time = "2026-04-02T16:26:51.69Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/4f/46309fcf9e1793c7184e3fc1aa73d7daf2b6a2b0fa1efbcf8d497101690e/bitarray-3.8.1-cp311-cp311-win_amd64.whl", hash = "sha256:ddcd25a1f72b2b545fb27e17882046a6c161f3f24514b2e028c00c58ed73a2dd", size = 150143, upload-time = "2026-04-02T16:26:52.9Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/1e/10289fb8e44fdd2d01adcc24d64b5c45ead709fbec76ee973f42e22b3059/bitarray-3.8.1-cp311-cp311-win_arm64.whl", hash = "sha256:dc2cab92c42991b711132bc52405680e075d1505d4356c4468bc6e9c93d49137", size = 147024, upload-time = "2026-04-02T16:26:54.151Z" },
     { url = "https://files.pythonhosted.org/packages/5d/4f/6ab3767b6642a6cbee4353f10a71fe25ade9899d539fae47c3d50686ebe2/bitarray-3.8.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4494c599effa16064f2b600f6eb28115182d6826847d795a55691339788d8a4d", size = 149202, upload-time = "2026-04-02T16:26:55.635Z" },
     { url = "https://files.pythonhosted.org/packages/eb/53/22bfffd13dd0a266f90011338b24eec45f25c91d37155bb2aa330351e17d/bitarray-3.8.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ff2ca039a161d49a8c713f5380def315c6f793df5fe348b94782b1dbee37a644", size = 145999, upload-time = "2026-04-02T16:26:56.849Z" },
     { url = "https://files.pythonhosted.org/packages/5d/dc/60aff29c88b648e18248921001cf9d7169abeda4d8db96f2dc1a24ed98ca/bitarray-3.8.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df3ffa6ef88166bb36f5d1492e71e664868b9b8b6afd55821e0ac0cb96625441", size = 335945, upload-time = "2026-04-02T16:26:58.403Z" },
@@ -545,13 +690,22 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
     { name = "mypy-extensions" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "packaging" },
     { name = "pathspec" },
     { name = "platformdirs" },
+    { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/a2/47/c9997eb470a7f48f7aaddd3d9a828244a2e4199569e38128715c48059ac1/black-24.4.2.tar.gz", hash = "sha256:c872b53057f000085da66a19c55d68f6f8ddcac2642392ad3a355878406fbd4d", size = 642299, upload-time = "2024-04-26T00:32:15.305Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/40/f6/3adc48c210527a7b651aaed43824a9b8bd04b3fb361a5227bad046e1c876/black-24.4.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dd1b5a14e417189db4c7b64a6540f31730713d173f0b63e55fabd52d61d8fdce", size = 1631487, upload-time = "2024-04-26T00:40:28.969Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/25/70aa1bec12c841a03e333e312daa0cf2fee50ea6336ac4851c93c0e2b411/black-24.4.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8e537d281831ad0e71007dcdcbe50a71470b978c453fa41ce77186bbe0ed6021", size = 1456317, upload-time = "2024-04-26T00:39:10.333Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/7d/7f8df0fdbbbefc4362d3eca6b69b7a8a4249a8a88dabc00a207d31fddcd7/black-24.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eaea3008c281f1038edb473c1aa8ed8143a5535ff18f978a318f10302b254063", size = 1822765, upload-time = "2024-04-26T00:34:56.436Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/21/1ee97841c469c1551133cbe47448cdba9628c7d9431f74f114f02e3b233c/black-24.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:7768a0dbf16a39aa5e9a3ded568bb545c8c2727396d063bbaf847df05b08cd96", size = 1409336, upload-time = "2024-04-26T00:35:30.392Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/f7/591d601c3046ceb65b97291dfe87fa25124cffac3d97aaaba89d0f0d7bdf/black-24.4.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:257d724c2c9b1660f353b36c802ccece186a30accc7742c176d29c146df6e474", size = 1615013, upload-time = "2024-04-26T00:39:49.415Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/17/5e0036b265bbf6bc44970d93d48febcbc03701b671db3c9603fd43ebc616/black-24.4.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bdde6f877a18f24844e381d45e9947a49e97933573ac9d4345399be37621e26c", size = 1436163, upload-time = "2024-04-26T00:40:20.267Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/48/34176b522e8cff4620a5d96c2e323ff2413f574870eb25efa8025885e028/black-24.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e151054aa00bad1f4e1f04919542885f89f5f7d086b8a59e5000e6c616896ffb", size = 1803382, upload-time = "2024-04-26T00:34:38.665Z" },
+    { url = "https://files.pythonhosted.org/packages/74/ce/e8eec1a77edbfa982bee3b5460dcdd4fe0e4e3165fc15d8ec44d04da7776/black-24.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:7e122b1c4fb252fd85df3ca93578732b4749d9be076593076ef4d07a0233c3e1", size = 1417802, upload-time = "2024-04-26T00:35:08.804Z" },
     { url = "https://files.pythonhosted.org/packages/f4/75/3a29de3bda4006cc280d833b5d961cf7df3810a21f49e7a63a7e551fb351/black-24.4.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:accf49e151c8ed2c0cdc528691838afd217c50412534e876a19270fea1e28e2d", size = 1645176, upload-time = "2024-04-26T00:42:35.606Z" },
     { url = "https://files.pythonhosted.org/packages/be/b8/9c152301774fa62a265b035a8ede4d6280827904ea1af8c3be10a28d3187/black-24.4.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:88c57dc656038f1ab9f92b3eb5335ee9b021412feaa46330d5eba4e51fe49b04", size = 1446227, upload-time = "2024-04-26T00:40:35.195Z" },
     { url = "https://files.pythonhosted.org/packages/25/6d/eb15a1b155f755f43766cc473618c6e1de6555d6a1764965643f486dcf01/black-24.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be8bef99eb46d5021bf053114442914baeb3649a89dc5f3a555c88737e5e98fc", size = 1832011, upload-time = "2024-04-26T00:34:37.825Z" },
@@ -570,16 +724,16 @@ wheels = [
 
 [[package]]
 name = "botocore"
-version = "1.42.84"
+version = "1.42.70"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "jmespath" },
     { name = "python-dateutil" },
     { name = "urllib3" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/b4/b7/1c03423843fb0d1795b686511c00ee63fed1234c2400f469aeedfd42212f/botocore-1.42.84.tar.gz", hash = "sha256:234064604c80d9272a5e9f6b3566d260bcaa053a5e05246db90d7eca1c2cf44b", size = 15148615, upload-time = "2026-04-06T19:38:56.673Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/66/54/b80e1fcee4f732e0e9314bbb8679be9d5690caa1566c4a4cd14e9724d2dd/botocore-1.42.70.tar.gz", hash = "sha256:9ee17553b7febd1a0c1253b3b62ab5d79607eb6163c8fb943470a8893c31d4fa", size = 14997068, upload-time = "2026-03-17T19:43:10.678Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e3/37/0c0c90361c8a1b9e6c75222ca24ae12996a298c0e18822a72ab229c37207/botocore-1.42.84-py3-none-any.whl", hash = "sha256:15f3fe07dfa6545e46a60c4b049fe2bdf63803c595ae4a4eec90e8f8172764f3", size = 14827061, upload-time = "2026-04-06T19:38:53.613Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/51/08f32aea872253173f513ba68122f4300966290677c8e59887b4ffd5d957/botocore-1.42.70-py3-none-any.whl", hash = "sha256:54ed9d25f05f810efd22b0dfda0bb9178df3ad8952b2e4359e05156c9321bd3c", size = 14671393, upload-time = "2026-03-17T19:43:06.777Z" },
 ]
 
 [[package]]
@@ -624,8 +778,7 @@ version = "1.6.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "ninja" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "packaging" },
     { name = "torch", marker = "sys_platform == 'never'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/63/15/ec51d77a2df03ee93410f8ee97fceeb7181da213813c51243e9dd6d7e144/causal_conv1d-1.6.1.tar.gz", hash = "sha256:e4a697ec2db3906f012e675125569f8b510b4559bc53e3095143d91369e1221b", size = 29426, upload-time = "2026-03-10T08:56:35.305Z" }
@@ -648,6 +801,31 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/93/d7/516d984057745a6cd96575eea814fe1edd6646ee6efd552fb7b0921dec83/cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44", size = 184283, upload-time = "2025-09-08T23:22:08.01Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/84/ad6a0b408daa859246f57c03efd28e5dd1b33c21737c2db84cae8c237aa5/cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49", size = 180504, upload-time = "2025-09-08T23:22:10.637Z" },
+    { url = "https://files.pythonhosted.org/packages/50/bd/b1a6362b80628111e6653c961f987faa55262b4002fcec42308cad1db680/cffi-2.0.0-cp310-cp310-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c", size = 208811, upload-time = "2025-09-08T23:22:12.267Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/27/6933a8b2562d7bd1fb595074cf99cc81fc3789f6a6c05cdabb46284a3188/cffi-2.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb", size = 216402, upload-time = "2025-09-08T23:22:13.455Z" },
+    { url = "https://files.pythonhosted.org/packages/05/eb/b86f2a2645b62adcfff53b0dd97e8dfafb5c8aa864bd0d9a2c2049a0d551/cffi-2.0.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0", size = 203217, upload-time = "2025-09-08T23:22:14.596Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/e0/6cbe77a53acf5acc7c08cc186c9928864bd7c005f9efd0d126884858a5fe/cffi-2.0.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4", size = 203079, upload-time = "2025-09-08T23:22:15.769Z" },
+    { url = "https://files.pythonhosted.org/packages/98/29/9b366e70e243eb3d14a5cb488dfd3a0b6b2f1fb001a203f653b93ccfac88/cffi-2.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453", size = 216475, upload-time = "2025-09-08T23:22:17.427Z" },
+    { url = "https://files.pythonhosted.org/packages/21/7a/13b24e70d2f90a322f2900c5d8e1f14fa7e2a6b3332b7309ba7b2ba51a5a/cffi-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495", size = 218829, upload-time = "2025-09-08T23:22:19.069Z" },
+    { url = "https://files.pythonhosted.org/packages/60/99/c9dc110974c59cc981b1f5b66e1d8af8af764e00f0293266824d9c4254bc/cffi-2.0.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5", size = 211211, upload-time = "2025-09-08T23:22:20.588Z" },
+    { url = "https://files.pythonhosted.org/packages/49/72/ff2d12dbf21aca1b32a40ed792ee6b40f6dc3a9cf1644bd7ef6e95e0ac5e/cffi-2.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb", size = 218036, upload-time = "2025-09-08T23:22:22.143Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/cc/027d7fb82e58c48ea717149b03bcadcbdc293553edb283af792bd4bcbb3f/cffi-2.0.0-cp310-cp310-win32.whl", hash = "sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a", size = 172184, upload-time = "2025-09-08T23:22:23.328Z" },
+    { url = "https://files.pythonhosted.org/packages/33/fa/072dd15ae27fbb4e06b437eb6e944e75b068deb09e2a2826039e49ee2045/cffi-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739", size = 182790, upload-time = "2025-09-08T23:22:24.752Z" },
+    { url = "https://files.pythonhosted.org/packages/12/4a/3dfd5f7850cbf0d06dc84ba9aa00db766b52ca38d8b86e3a38314d52498c/cffi-2.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe", size = 184344, upload-time = "2025-09-08T23:22:26.456Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/8b/f0e4c441227ba756aafbe78f117485b25bb26b1c059d01f137fa6d14896b/cffi-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c", size = 180560, upload-time = "2025-09-08T23:22:28.197Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/b7/1200d354378ef52ec227395d95c2576330fd22a869f7a70e88e1447eb234/cffi-2.0.0-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92", size = 209613, upload-time = "2025-09-08T23:22:29.475Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/56/6033f5e86e8cc9bb629f0077ba71679508bdf54a9a5e112a3c0b91870332/cffi-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93", size = 216476, upload-time = "2025-09-08T23:22:31.063Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/7f/55fecd70f7ece178db2f26128ec41430d8720f2d12ca97bf8f0a628207d5/cffi-2.0.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5", size = 203374, upload-time = "2025-09-08T23:22:32.507Z" },
+    { url = "https://files.pythonhosted.org/packages/84/ef/a7b77c8bdc0f77adc3b46888f1ad54be8f3b7821697a7b89126e829e676a/cffi-2.0.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664", size = 202597, upload-time = "2025-09-08T23:22:34.132Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/91/500d892b2bf36529a75b77958edfcd5ad8e2ce4064ce2ecfeab2125d72d1/cffi-2.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26", size = 215574, upload-time = "2025-09-08T23:22:35.443Z" },
+    { url = "https://files.pythonhosted.org/packages/44/64/58f6255b62b101093d5df22dcb752596066c7e89dd725e0afaed242a61be/cffi-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9", size = 218971, upload-time = "2025-09-08T23:22:36.805Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/49/fa72cebe2fd8a55fbe14956f9970fe8eb1ac59e5df042f603ef7c8ba0adc/cffi-2.0.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414", size = 211972, upload-time = "2025-09-08T23:22:38.436Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/28/dd0967a76aab36731b6ebfe64dec4e981aff7e0608f60c2d46b46982607d/cffi-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743", size = 217078, upload-time = "2025-09-08T23:22:39.776Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/c0/015b25184413d7ab0a410775fdb4a50fca20f5589b5dab1dbbfa3baad8ce/cffi-2.0.0-cp311-cp311-win32.whl", hash = "sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5", size = 172076, upload-time = "2025-09-08T23:22:40.95Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/8f/dc5531155e7070361eb1b7e4c1a9d896d0cb21c49f807a6c03fd63fc877e/cffi-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5", size = 182820, upload-time = "2025-09-08T23:22:42.463Z" },
+    { url = "https://files.pythonhosted.org/packages/95/5c/1b493356429f9aecfd56bc171285a4c4ac8697f76e9bbbbb105e537853a1/cffi-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d", size = 177635, upload-time = "2025-09-08T23:22:43.623Z" },
     { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" },
     { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" },
     { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" },
@@ -702,6 +880,38 @@ version = "3.4.7"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271, upload-time = "2026-04-02T09:28:39.342Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/26/08/0f303cb0b529e456bb116f2d50565a482694fbb94340bf56d44677e7ed03/charset_normalizer-3.4.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cdd68a1fb318e290a2077696b7eb7a21a49163c455979c639bf5a5dcdc46617d", size = 315182, upload-time = "2026-04-02T09:25:40.673Z" },
+    { url = "https://files.pythonhosted.org/packages/24/47/b192933e94b546f1b1fe4df9cc1f84fcdbf2359f8d1081d46dd029b50207/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e17b8d5d6a8c47c85e68ca8379def1303fd360c3e22093a807cd34a71cd082b8", size = 209329, upload-time = "2026-04-02T09:25:42.354Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/b4/01fa81c5ca6141024d89a8fc15968002b71da7f825dd14113207113fabbd/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:511ef87c8aec0783e08ac18565a16d435372bc1ac25a91e6ac7f5ef2b0bff790", size = 231230, upload-time = "2026-04-02T09:25:44.281Z" },
+    { url = "https://files.pythonhosted.org/packages/20/f7/7b991776844dfa058017e600e6e55ff01984a063290ca5622c0b63162f68/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:007d05ec7321d12a40227aae9e2bc6dca73f3cb21058999a1df9e193555a9dcc", size = 225890, upload-time = "2026-04-02T09:25:45.475Z" },
+    { url = "https://files.pythonhosted.org/packages/20/e7/bed0024a0f4ab0c8a9c64d4445f39b30c99bd1acd228291959e3de664247/charset_normalizer-3.4.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:cf29836da5119f3c8a8a70667b0ef5fdca3bb12f80fd06487cfa575b3909b393", size = 216930, upload-time = "2026-04-02T09:25:46.58Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/ab/b18f0ab31cdd7b3ddb8bb76c4a414aeb8160c9810fdf1bc62f269a539d87/charset_normalizer-3.4.7-cp310-cp310-manylinux_2_31_armv7l.whl", hash = "sha256:12d8baf840cc7889b37c7c770f478adea7adce3dcb3944d02ec87508e2dcf153", size = 202109, upload-time = "2026-04-02T09:25:48.031Z" },
+    { url = "https://files.pythonhosted.org/packages/82/e5/7e9440768a06dfb3075936490cb82dbf0ee20a133bf0dd8551fa096914ec/charset_normalizer-3.4.7-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:d560742f3c0d62afaccf9f41fe485ed69bd7661a241f86a3ef0f0fb8b1a397af", size = 214684, upload-time = "2026-04-02T09:25:49.245Z" },
+    { url = "https://files.pythonhosted.org/packages/71/94/8c61d8da9f062fdf457c80acfa25060ec22bf1d34bbeaca4350f13bcfd07/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b14b2d9dac08e28bb8046a1a0434b1750eb221c8f5b87a68f4fa11a6f97b5e34", size = 212785, upload-time = "2026-04-02T09:25:50.671Z" },
+    { url = "https://files.pythonhosted.org/packages/66/cd/6e9889c648e72c0ab2e5967528bb83508f354d706637bc7097190c874e13/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:bc17a677b21b3502a21f66a8cc64f5bfad4df8a0b8434d661666f8ce90ac3af1", size = 203055, upload-time = "2026-04-02T09:25:51.802Z" },
+    { url = "https://files.pythonhosted.org/packages/92/2e/7a951d6a08aefb7eb8e1b54cdfb580b1365afdd9dd484dc4bee9e5d8f258/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:750e02e074872a3fad7f233b47734166440af3cdea0add3e95163110816d6752", size = 232502, upload-time = "2026-04-02T09:25:53.388Z" },
+    { url = "https://files.pythonhosted.org/packages/58/d5/abcf2d83bf8e0a1286df55cd0dc1d49af0da4282aa77e986df343e7de124/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:4e5163c14bffd570ef2affbfdd77bba66383890797df43dc8b4cc7d6f500bf53", size = 214295, upload-time = "2026-04-02T09:25:54.765Z" },
+    { url = "https://files.pythonhosted.org/packages/47/3a/7d4cd7ed54be99973a0dc176032cba5cb1f258082c31fa6df35cff46acfc/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:6ed74185b2db44f41ef35fd1617c5888e59792da9bbc9190d6c7300617182616", size = 227145, upload-time = "2026-04-02T09:25:55.904Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/98/3a45bf8247889cf28262ebd3d0872edff11565b2a1e3064ccb132db3fbb0/charset_normalizer-3.4.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:94e1885b270625a9a828c9793b4d52a64445299baa1fea5a173bf1d3dd9a1a5a", size = 218884, upload-time = "2026-04-02T09:25:57.074Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/80/2e8b7f8915ed5c9ef13aa828d82738e33888c485b65ebf744d615040c7ea/charset_normalizer-3.4.7-cp310-cp310-win32.whl", hash = "sha256:6785f414ae0f3c733c437e0f3929197934f526d19dfaa75e18fdb4f94c6fb374", size = 148343, upload-time = "2026-04-02T09:25:58.199Z" },
+    { url = "https://files.pythonhosted.org/packages/35/1b/3b8c8c77184af465ee9ad88b5aea46ea6b2e1f7b9dc9502891e37af21e30/charset_normalizer-3.4.7-cp310-cp310-win_amd64.whl", hash = "sha256:6696b7688f54f5af4462118f0bfa7c1621eeb87154f77fa04b9295ce7a8f2943", size = 159174, upload-time = "2026-04-02T09:25:59.322Z" },
+    { url = "https://files.pythonhosted.org/packages/be/c1/feb40dca40dbb21e0a908801782d9288c64fc8d8e562c2098e9994c8c21b/charset_normalizer-3.4.7-cp310-cp310-win_arm64.whl", hash = "sha256:66671f93accb62ed07da56613636f3641f1a12c13046ce91ffc923721f23c008", size = 147805, upload-time = "2026-04-02T09:26:00.756Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/d7/b5b7020a0565c2e9fa8c09f4b5fa6232feb326b8c20081ccded47ea368fd/charset_normalizer-3.4.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7641bb8895e77f921102f72833904dcd9901df5d6d72a2ab8f31d04b7e51e4e7", size = 309705, upload-time = "2026-04-02T09:26:02.191Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/53/58c29116c340e5456724ecd2fff4196d236b98f3da97b404bc5e51ac3493/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:202389074300232baeb53ae2569a60901f7efadd4245cf3a3bf0617d60b439d7", size = 206419, upload-time = "2026-04-02T09:26:03.583Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/02/e8146dc6591a37a00e5144c63f29fb7c97a734ea8a111190783c0e60ab63/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:30b8d1d8c52a48c2c5690e152c169b673487a2a58de1ec7393196753063fcd5e", size = 227901, upload-time = "2026-04-02T09:26:04.738Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/73/77486c4cd58f1267bf17db420e930c9afa1b3be3fe8c8b8ebbebc9624359/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:532bc9bf33a68613fd7d65e4b1c71a6a38d7d42604ecf239c77392e9b4e8998c", size = 222742, upload-time = "2026-04-02T09:26:06.36Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/fa/f74eb381a7d94ded44739e9d94de18dc5edc9c17fb8c11f0a6890696c0a9/charset_normalizer-3.4.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2fe249cb4651fd12605b7288b24751d8bfd46d35f12a20b1ba33dea122e690df", size = 214061, upload-time = "2026-04-02T09:26:08.347Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/92/42bd3cefcf7687253fb86694b45f37b733c97f59af3724f356fa92b8c344/charset_normalizer-3.4.7-cp311-cp311-manylinux_2_31_armv7l.whl", hash = "sha256:65bcd23054beab4d166035cabbc868a09c1a49d1efe458fe8e4361215df40265", size = 199239, upload-time = "2026-04-02T09:26:09.823Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/3d/069e7184e2aa3b3cddc700e3dd267413dc259854adc3380421c805c6a17d/charset_normalizer-3.4.7-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:08e721811161356f97b4059a9ba7bafb23ea5ee2255402c42881c214e173c6b4", size = 210173, upload-time = "2026-04-02T09:26:10.953Z" },
+    { url = "https://files.pythonhosted.org/packages/62/51/9d56feb5f2e7074c46f93e0ebdbe61f0848ee246e2f0d89f8e20b89ebb8f/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:e060d01aec0a910bdccb8be71faf34e7799ce36950f8294c8bf612cba65a2c9e", size = 209841, upload-time = "2026-04-02T09:26:12.142Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/59/893d8f99cc4c837dda1fe2f1139079703deb9f321aabcb032355de13b6c7/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:38c0109396c4cfc574d502df99742a45c72c08eff0a36158b6f04000043dbf38", size = 200304, upload-time = "2026-04-02T09:26:13.711Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/1d/ee6f3be3464247578d1ed5c46de545ccc3d3ff933695395c402c21fa6b77/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:1c2a768fdd44ee4a9339a9b0b130049139b8ce3c01d2ce09f67f5a68048d477c", size = 229455, upload-time = "2026-04-02T09:26:14.941Z" },
+    { url = "https://files.pythonhosted.org/packages/54/bb/8fb0a946296ea96a488928bdce8ef99023998c48e4713af533e9bb98ef07/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:1a87ca9d5df6fe460483d9a5bbf2b18f620cbed41b432e2bddb686228282d10b", size = 210036, upload-time = "2026-04-02T09:26:16.478Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/bc/015b2387f913749f82afd4fcba07846d05b6d784dd16123cb66860e0237d/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:d635aab80466bc95771bb78d5370e74d36d1fe31467b6b29b8b57b2a3cd7d22c", size = 224739, upload-time = "2026-04-02T09:26:17.751Z" },
+    { url = "https://files.pythonhosted.org/packages/17/ab/63133691f56baae417493cba6b7c641571a2130eb7bceba6773367ab9ec5/charset_normalizer-3.4.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ae196f021b5e7c78e918242d217db021ed2a6ace2bc6ae94c0fc596221c7f58d", size = 216277, upload-time = "2026-04-02T09:26:18.981Z" },
+    { url = "https://files.pythonhosted.org/packages/06/6d/3be70e827977f20db77c12a97e6a9f973631a45b8d186c084527e53e77a4/charset_normalizer-3.4.7-cp311-cp311-win32.whl", hash = "sha256:adb2597b428735679446b46c8badf467b4ca5f5056aae4d51a19f9570301b1ad", size = 147819, upload-time = "2026-04-02T09:26:20.295Z" },
+    { url = "https://files.pythonhosted.org/packages/20/d9/5f67790f06b735d7c7637171bbfd89882ad67201891b7275e51116ed8207/charset_normalizer-3.4.7-cp311-cp311-win_amd64.whl", hash = "sha256:8e385e4267ab76874ae30db04c627faaaf0b509e1ccc11a95b3fc3e83f855c00", size = 159281, upload-time = "2026-04-02T09:26:21.74Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/83/6413f36c5a34afead88ce6f66684d943d91f233d76dd083798f9602b75ae/charset_normalizer-3.4.7-cp311-cp311-win_arm64.whl", hash = "sha256:d4a48e5b3c2a489fae013b7589308a40146ee081f6f509e047e0e096084ceca1", size = 147843, upload-time = "2026-04-02T09:26:22.901Z" },
     { url = "https://files.pythonhosted.org/packages/0c/eb/4fc8d0a7110eb5fc9cc161723a34a8a6c200ce3b4fbf681bc86feee22308/charset_normalizer-3.4.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46", size = 311328, upload-time = "2026-04-02T09:26:24.331Z" },
     { url = "https://files.pythonhosted.org/packages/f8/e3/0fadc706008ac9d7b9b5be6dc767c05f9d3e5df51744ce4cc9605de7b9f4/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2", size = 208061, upload-time = "2026-04-02T09:26:25.568Z" },
     { url = "https://files.pythonhosted.org/packages/42/f0/3dd1045c47f4a4604df85ec18ad093912ae1344ac706993aff91d38773a2/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b", size = 229031, upload-time = "2026-04-02T09:26:26.865Z" },
@@ -826,6 +1036,35 @@ version = "7.13.5"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/9d/e0/70553e3000e345daff267cec284ce4cbf3fc141b6da229ac52775b5428f1/coverage-7.13.5.tar.gz", hash = "sha256:c81f6515c4c40141f83f502b07bbfa5c240ba25bbe73da7b33f1e5b6120ff179", size = 915967, upload-time = "2026-03-17T10:33:18.341Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/69/33/e8c48488c29a73fd089f9d71f9653c1be7478f2ad6b5bc870db11a55d23d/coverage-7.13.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e0723d2c96324561b9aa76fb982406e11d93cdb388a7a7da2b16e04719cf7ca5", size = 219255, upload-time = "2026-03-17T10:29:51.081Z" },
+    { url = "https://files.pythonhosted.org/packages/da/bd/b0ebe9f677d7f4b74a3e115eec7ddd4bcf892074963a00d91e8b164a6386/coverage-7.13.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:52f444e86475992506b32d4e5ca55c24fc88d73bcbda0e9745095b28ef4dc0cf", size = 219772, upload-time = "2026-03-17T10:29:52.867Z" },
+    { url = "https://files.pythonhosted.org/packages/48/cc/5cb9502f4e01972f54eedd48218bb203fe81e294be606a2bc93970208013/coverage-7.13.5-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:704de6328e3d612a8f6c07000a878ff38181ec3263d5a11da1db294fa6a9bdf8", size = 246532, upload-time = "2026-03-17T10:29:54.688Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/d8/3217636d86c7e7b12e126e4f30ef1581047da73140614523af7495ed5f2d/coverage-7.13.5-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a1a6d79a14e1ec1832cabc833898636ad5f3754a678ef8bb4908515208bf84f4", size = 248333, upload-time = "2026-03-17T10:29:56.221Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/30/2002ac6729ba2d4357438e2ed3c447ad8562866c8c63fc16f6dfc33afe56/coverage-7.13.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79060214983769c7ba3f0cee10b54c97609dca4d478fa1aa32b914480fd5738d", size = 250211, upload-time = "2026-03-17T10:29:57.938Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/85/552496626d6b9359eb0e2f86f920037c9cbfba09b24d914c6e1528155f7d/coverage-7.13.5-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:356e76b46783a98c2a2fe81ec79df4883a1e62895ea952968fb253c114e7f930", size = 252125, upload-time = "2026-03-17T10:29:59.388Z" },
+    { url = "https://files.pythonhosted.org/packages/44/21/40256eabdcbccdb6acf6b381b3016a154399a75fe39d406f790ae84d1f3c/coverage-7.13.5-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0cef0cdec915d11254a7f549c1170afecce708d30610c6abdded1f74e581666d", size = 247219, upload-time = "2026-03-17T10:30:01.199Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/e8/96e2a6c3f21a0ea77d7830b254a1542d0328acc8d7bdf6a284ba7e529f77/coverage-7.13.5-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dc022073d063b25a402454e5712ef9e007113e3a676b96c5f29b2bda29352f40", size = 248248, upload-time = "2026-03-17T10:30:03.317Z" },
+    { url = "https://files.pythonhosted.org/packages/da/ba/8477f549e554827da390ec659f3c38e4b6d95470f4daafc2d8ff94eaa9c2/coverage-7.13.5-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:9b74db26dfea4f4e50d48a4602207cd1e78be33182bc9cbf22da94f332f99878", size = 246254, upload-time = "2026-03-17T10:30:04.832Z" },
+    { url = "https://files.pythonhosted.org/packages/55/59/bc22aef0e6aa179d5b1b001e8b3654785e9adf27ef24c93dc4228ebd5d68/coverage-7.13.5-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:ad146744ca4fd09b50c482650e3c1b1f4dfa1d4792e0a04a369c7f23336f0400", size = 250067, upload-time = "2026-03-17T10:30:06.535Z" },
+    { url = "https://files.pythonhosted.org/packages/de/1b/c6a023a160806a5137dca53468fd97530d6acad24a22003b1578a9c2e429/coverage-7.13.5-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:c555b48be1853fe3997c11c4bd521cdd9a9612352de01fa4508f16ec341e6fe0", size = 246521, upload-time = "2026-03-17T10:30:08.486Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/3f/3532c85a55aa2f899fa17c186f831cfa1aa434d88ff792a709636f64130e/coverage-7.13.5-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7034b5c56a58ae5e85f23949d52c14aca2cfc6848a31764995b7de88f13a1ea0", size = 247126, upload-time = "2026-03-17T10:30:09.966Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/2e/b9d56af4a24ef45dfbcda88e06870cb7d57b2b0bfa3a888d79b4c8debd76/coverage-7.13.5-cp310-cp310-win32.whl", hash = "sha256:eb7fdf1ef130660e7415e0253a01a7d5a88c9c4d158bcf75cbbd922fd65a5b58", size = 221860, upload-time = "2026-03-17T10:30:11.393Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/cc/d938417e7a4d7f0433ad4edee8bb2acdc60dc7ac5af19e2a07a048ecbee3/coverage-7.13.5-cp310-cp310-win_amd64.whl", hash = "sha256:3e1bb5f6c78feeb1be3475789b14a0f0a5b47d505bfc7267126ccbd50289999e", size = 222788, upload-time = "2026-03-17T10:30:12.886Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/37/d24c8f8220ff07b839b2c043ea4903a33b0f455abe673ae3c03bbdb7f212/coverage-7.13.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:66a80c616f80181f4d643b0f9e709d97bcea413ecd9631e1dedc7401c8e6695d", size = 219381, upload-time = "2026-03-17T10:30:14.68Z" },
+    { url = "https://files.pythonhosted.org/packages/35/8b/cd129b0ca4afe886a6ce9d183c44d8301acbd4ef248622e7c49a23145605/coverage-7.13.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:145ede53ccbafb297c1c9287f788d1bc3efd6c900da23bf6931b09eafc931587", size = 219880, upload-time = "2026-03-17T10:30:16.231Z" },
+    { url = "https://files.pythonhosted.org/packages/55/2f/e0e5b237bffdb5d6c530ce87cc1d413a5b7d7dfd60fb067ad6d254c35c76/coverage-7.13.5-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:0672854dc733c342fa3e957e0605256d2bf5934feeac328da9e0b5449634a642", size = 250303, upload-time = "2026-03-17T10:30:17.748Z" },
+    { url = "https://files.pythonhosted.org/packages/92/be/b1afb692be85b947f3401375851484496134c5554e67e822c35f28bf2fbc/coverage-7.13.5-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ec10e2a42b41c923c2209b846126c6582db5e43a33157e9870ba9fb70dc7854b", size = 252218, upload-time = "2026-03-17T10:30:19.804Z" },
+    { url = "https://files.pythonhosted.org/packages/da/69/2f47bb6fa1b8d1e3e5d0c4be8ccb4313c63d742476a619418f85740d597b/coverage-7.13.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:be3d4bbad9d4b037791794ddeedd7d64a56f5933a2c1373e18e9e568b9141686", size = 254326, upload-time = "2026-03-17T10:30:21.321Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/d0/79db81da58965bd29dabc8f4ad2a2af70611a57cba9d1ec006f072f30a54/coverage-7.13.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:4d2afbc5cc54d286bfb54541aa50b64cdb07a718227168c87b9e2fb8f25e1743", size = 256267, upload-time = "2026-03-17T10:30:23.094Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/32/d0d7cc8168f91ddab44c0ce4806b969df5f5fdfdbb568eaca2dbc2a04936/coverage-7.13.5-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3ad050321264c49c2fa67bb599100456fc51d004b82534f379d16445da40fb75", size = 250430, upload-time = "2026-03-17T10:30:25.311Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/06/a055311d891ddbe231cd69fdd20ea4be6e3603ffebddf8704b8ca8e10a3c/coverage-7.13.5-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7300c8a6d13335b29bb76d7651c66af6bd8658517c43499f110ddc6717bfc209", size = 252017, upload-time = "2026-03-17T10:30:27.284Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/f6/d0fd2d21e29a657b5f77a2fe7082e1568158340dceb941954f776dce1b7b/coverage-7.13.5-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:eb07647a5738b89baab047f14edd18ded523de60f3b30e75c2acc826f79c839a", size = 250080, upload-time = "2026-03-17T10:30:29.481Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/ab/0d7fb2efc2e9a5eb7ddcc6e722f834a69b454b7e6e5888c3a8567ecffb31/coverage-7.13.5-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:9adb6688e3b53adffefd4a52d72cbd8b02602bfb8f74dcd862337182fd4d1a4e", size = 253843, upload-time = "2026-03-17T10:30:31.301Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/6f/7467b917bbf5408610178f62a49c0ed4377bb16c1657f689cc61470da8ce/coverage-7.13.5-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7c8d4bc913dd70b93488d6c496c77f3aff5ea99a07e36a18f865bca55adef8bd", size = 249802, upload-time = "2026-03-17T10:30:33.358Z" },
+    { url = "https://files.pythonhosted.org/packages/75/2c/1172fb689df92135f5bfbbd69fc83017a76d24ea2e2f3a1154007e2fb9f8/coverage-7.13.5-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:0e3c426ffc4cd952f54ee9ffbdd10345709ecc78a3ecfd796a57236bfad0b9b8", size = 250707, upload-time = "2026-03-17T10:30:35.2Z" },
+    { url = "https://files.pythonhosted.org/packages/67/21/9ac389377380a07884e3b48ba7a620fcd9dbfaf1d40565facdc6b36ec9ef/coverage-7.13.5-cp311-cp311-win32.whl", hash = "sha256:259b69bb83ad9894c4b25be2528139eecba9a82646ebdda2d9db1ba28424a6bf", size = 221880, upload-time = "2026-03-17T10:30:36.775Z" },
+    { url = "https://files.pythonhosted.org/packages/af/7f/4cd8a92531253f9d7c1bbecd9fa1b472907fb54446ca768c59b531248dc5/coverage-7.13.5-cp311-cp311-win_amd64.whl", hash = "sha256:258354455f4e86e3e9d0d17571d522e13b4e1e19bf0f8596bcf9476d61e7d8a9", size = 222816, upload-time = "2026-03-17T10:30:38.891Z" },
+    { url = "https://files.pythonhosted.org/packages/12/a6/1d3f6155fb0010ca68eba7fe48ca6c9da7385058b77a95848710ecf189b1/coverage-7.13.5-cp311-cp311-win_arm64.whl", hash = "sha256:bff95879c33ec8da99fc9b6fe345ddb5be6414b41d6d1ad1c8f188d26f36e028", size = 221483, upload-time = "2026-03-17T10:30:40.463Z" },
     { url = "https://files.pythonhosted.org/packages/a0/c3/a396306ba7db865bf96fc1fb3b7fd29bcbf3d829df642e77b13555163cd6/coverage-7.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:460cf0114c5016fa841214ff5564aa4864f11948da9440bc97e21ad1f4ba1e01", size = 219554, upload-time = "2026-03-17T10:30:42.208Z" },
     { url = "https://files.pythonhosted.org/packages/a6/16/a68a19e5384e93f811dccc51034b1fd0b865841c390e3c931dcc4699e035/coverage-7.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0e223ce4b4ed47f065bfb123687686512e37629be25cc63728557ae7db261422", size = 219908, upload-time = "2026-03-17T10:30:43.906Z" },
     { url = "https://files.pythonhosted.org/packages/29/72/20b917c6793af3a5ceb7fb9c50033f3ec7865f2911a1416b34a7cfa0813b/coverage-7.13.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:6e3370441f4513c6252bf042b9c36d22491142385049243253c7e48398a15a9f", size = 251419, upload-time = "2026-03-17T10:30:45.545Z" },
@@ -904,57 +1143,69 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/ee/a4cf96b8ce1e566ed238f0659ac2d3f007ed1d14b181bcb684e19561a69a/coverage-7.13.5-py3-none-any.whl", hash = "sha256:34b02417cf070e173989b3db962f7ed56d2f644307b2cf9d5a0f258e13084a61", size = 211346, upload-time = "2026-03-17T10:33:15.691Z" },
 ]
 
+[package.optional-dependencies]
+toml = [
+    { name = "tomli", marker = "python_full_version <= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+
 [[package]]
 name = "cryptography"
-version = "46.0.7"
+version = "46.0.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cffi", marker = "platform_python_implementation != 'PyPy'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/47/93/ac8f3d5ff04d54bc814e961a43ae5b0b146154c89c61b47bb07557679b18/cryptography-46.0.7.tar.gz", hash = "sha256:e4cfd68c5f3e0bfdad0d38e023239b96a2fe84146481852dffbcca442c245aa5", size = 750652, upload-time = "2026-04-08T01:57:54.692Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/0b/5d/4a8f770695d73be252331e60e526291e3df0c9b27556a90a6b47bccca4c2/cryptography-46.0.7-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:ea42cbe97209df307fdc3b155f1b6fa2577c0defa8f1f7d3be7d31d189108ad4", size = 7179869, upload-time = "2026-04-08T01:56:17.157Z" },
-    { url = "https://files.pythonhosted.org/packages/5f/45/6d80dc379b0bbc1f9d1e429f42e4cb9e1d319c7a8201beffd967c516ea01/cryptography-46.0.7-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b36a4695e29fe69215d75960b22577197aca3f7a25b9cf9d165dcfe9d80bc325", size = 4275492, upload-time = "2026-04-08T01:56:19.36Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/9a/1765afe9f572e239c3469f2cb429f3ba7b31878c893b246b4b2994ffe2fe/cryptography-46.0.7-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5ad9ef796328c5e3c4ceed237a183f5d41d21150f972455a9d926593a1dcb308", size = 4426670, upload-time = "2026-04-08T01:56:21.415Z" },
-    { url = "https://files.pythonhosted.org/packages/8f/3e/af9246aaf23cd4ee060699adab1e47ced3f5f7e7a8ffdd339f817b446462/cryptography-46.0.7-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:73510b83623e080a2c35c62c15298096e2a5dc8d51c3b4e1740211839d0dea77", size = 4280275, upload-time = "2026-04-08T01:56:23.539Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/54/6bbbfc5efe86f9d71041827b793c24811a017c6ac0fd12883e4caa86b8ed/cryptography-46.0.7-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:cbd5fb06b62bd0721e1170273d3f4d5a277044c47ca27ee257025146c34cbdd1", size = 4928402, upload-time = "2026-04-08T01:56:25.624Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/cf/054b9d8220f81509939599c8bdbc0c408dbd2bdd41688616a20731371fe0/cryptography-46.0.7-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:420b1e4109cc95f0e5700eed79908cef9268265c773d3a66f7af1eef53d409ef", size = 4459985, upload-time = "2026-04-08T01:56:27.309Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/46/4e4e9c6040fb01c7467d47217d2f882daddeb8828f7df800cb806d8a2288/cryptography-46.0.7-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:24402210aa54baae71d99441d15bb5a1919c195398a87b563df84468160a65de", size = 3990652, upload-time = "2026-04-08T01:56:29.095Z" },
-    { url = "https://files.pythonhosted.org/packages/36/5f/313586c3be5a2fbe87e4c9a254207b860155a8e1f3cca99f9910008e7d08/cryptography-46.0.7-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:8a469028a86f12eb7d2fe97162d0634026d92a21f3ae0ac87ed1c4a447886c83", size = 4279805, upload-time = "2026-04-08T01:56:30.928Z" },
-    { url = "https://files.pythonhosted.org/packages/69/33/60dfc4595f334a2082749673386a4d05e4f0cf4df8248e63b2c3437585f2/cryptography-46.0.7-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:9694078c5d44c157ef3162e3bf3946510b857df5a3955458381d1c7cfc143ddb", size = 4892883, upload-time = "2026-04-08T01:56:32.614Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/0b/333ddab4270c4f5b972f980adef4faa66951a4aaf646ca067af597f15563/cryptography-46.0.7-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:42a1e5f98abb6391717978baf9f90dc28a743b7d9be7f0751a6f56a75d14065b", size = 4459756, upload-time = "2026-04-08T01:56:34.306Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/14/633913398b43b75f1234834170947957c6b623d1701ffc7a9600da907e89/cryptography-46.0.7-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:91bbcb08347344f810cbe49065914fe048949648f6bd5c2519f34619142bbe85", size = 4410244, upload-time = "2026-04-08T01:56:35.977Z" },
-    { url = "https://files.pythonhosted.org/packages/10/f2/19ceb3b3dc14009373432af0c13f46aa08e3ce334ec6eff13492e1812ccd/cryptography-46.0.7-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5d1c02a14ceb9148cc7816249f64f623fbfee39e8c03b3650d842ad3f34d637e", size = 4674868, upload-time = "2026-04-08T01:56:38.034Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/bb/a5c213c19ee94b15dfccc48f363738633a493812687f5567addbcbba9f6f/cryptography-46.0.7-cp311-abi3-win32.whl", hash = "sha256:d23c8ca48e44ee015cd0a54aeccdf9f09004eba9fc96f38c911011d9ff1bd457", size = 3026504, upload-time = "2026-04-08T01:56:39.666Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/02/7788f9fefa1d060ca68717c3901ae7fffa21ee087a90b7f23c7a603c32ae/cryptography-46.0.7-cp311-abi3-win_amd64.whl", hash = "sha256:397655da831414d165029da9bc483bed2fe0e75dde6a1523ec2fe63f3c46046b", size = 3488363, upload-time = "2026-04-08T01:56:41.893Z" },
-    { url = "https://files.pythonhosted.org/packages/7b/56/15619b210e689c5403bb0540e4cb7dbf11a6bf42e483b7644e471a2812b3/cryptography-46.0.7-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:d151173275e1728cf7839aaa80c34fe550c04ddb27b34f48c232193df8db5842", size = 7119671, upload-time = "2026-04-08T01:56:44Z" },
-    { url = "https://files.pythonhosted.org/packages/74/66/e3ce040721b0b5599e175ba91ab08884c75928fbeb74597dd10ef13505d2/cryptography-46.0.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:db0f493b9181c7820c8134437eb8b0b4792085d37dbb24da050476ccb664e59c", size = 4268551, upload-time = "2026-04-08T01:56:46.071Z" },
-    { url = "https://files.pythonhosted.org/packages/03/11/5e395f961d6868269835dee1bafec6a1ac176505a167f68b7d8818431068/cryptography-46.0.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ebd6daf519b9f189f85c479427bbd6e9c9037862cf8fe89ee35503bd209ed902", size = 4408887, upload-time = "2026-04-08T01:56:47.718Z" },
-    { url = "https://files.pythonhosted.org/packages/40/53/8ed1cf4c3b9c8e611e7122fb56f1c32d09e1fff0f1d77e78d9ff7c82653e/cryptography-46.0.7-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:b7b412817be92117ec5ed95f880defe9cf18a832e8cafacf0a22337dc1981b4d", size = 4271354, upload-time = "2026-04-08T01:56:49.312Z" },
-    { url = "https://files.pythonhosted.org/packages/50/46/cf71e26025c2e767c5609162c866a78e8a2915bbcfa408b7ca495c6140c4/cryptography-46.0.7-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:fbfd0e5f273877695cb93baf14b185f4878128b250cc9f8e617ea0c025dfb022", size = 4905845, upload-time = "2026-04-08T01:56:50.916Z" },
-    { url = "https://files.pythonhosted.org/packages/c0/ea/01276740375bac6249d0a971ebdf6b4dc9ead0ee0a34ef3b5a88c1a9b0d4/cryptography-46.0.7-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:ffca7aa1d00cf7d6469b988c581598f2259e46215e0140af408966a24cf086ce", size = 4444641, upload-time = "2026-04-08T01:56:52.882Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/4c/7d258f169ae71230f25d9f3d06caabcff8c3baf0978e2b7d65e0acac3827/cryptography-46.0.7-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:60627cf07e0d9274338521205899337c5d18249db56865f943cbe753aa96f40f", size = 3967749, upload-time = "2026-04-08T01:56:54.597Z" },
-    { url = "https://files.pythonhosted.org/packages/b5/2a/2ea0767cad19e71b3530e4cad9605d0b5e338b6a1e72c37c9c1ceb86c333/cryptography-46.0.7-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:80406c3065e2c55d7f49a9550fe0c49b3f12e5bfff5dedb727e319e1afb9bf99", size = 4270942, upload-time = "2026-04-08T01:56:56.416Z" },
-    { url = "https://files.pythonhosted.org/packages/41/3d/fe14df95a83319af25717677e956567a105bb6ab25641acaa093db79975d/cryptography-46.0.7-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:c5b1ccd1239f48b7151a65bc6dd54bcfcc15e028c8ac126d3fada09db0e07ef1", size = 4871079, upload-time = "2026-04-08T01:56:58.31Z" },
-    { url = "https://files.pythonhosted.org/packages/9c/59/4a479e0f36f8f378d397f4eab4c850b4ffb79a2f0d58704b8fa0703ddc11/cryptography-46.0.7-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:d5f7520159cd9c2154eb61eb67548ca05c5774d39e9c2c4339fd793fe7d097b2", size = 4443999, upload-time = "2026-04-08T01:57:00.508Z" },
-    { url = "https://files.pythonhosted.org/packages/28/17/b59a741645822ec6d04732b43c5d35e4ef58be7bfa84a81e5ae6f05a1d33/cryptography-46.0.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:fcd8eac50d9138c1d7fc53a653ba60a2bee81a505f9f8850b6b2888555a45d0e", size = 4399191, upload-time = "2026-04-08T01:57:02.654Z" },
-    { url = "https://files.pythonhosted.org/packages/59/6a/bb2e166d6d0e0955f1e9ff70f10ec4b2824c9cfcdb4da772c7dd69cc7d80/cryptography-46.0.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:65814c60f8cc400c63131584e3e1fad01235edba2614b61fbfbfa954082db0ee", size = 4655782, upload-time = "2026-04-08T01:57:04.592Z" },
-    { url = "https://files.pythonhosted.org/packages/95/b6/3da51d48415bcb63b00dc17c2eff3a651b7c4fed484308d0f19b30e8cb2c/cryptography-46.0.7-cp314-cp314t-win32.whl", hash = "sha256:fdd1736fed309b4300346f88f74cd120c27c56852c3838cab416e7a166f67298", size = 3002227, upload-time = "2026-04-08T01:57:06.91Z" },
-    { url = "https://files.pythonhosted.org/packages/32/a8/9f0e4ed57ec9cebe506e58db11ae472972ecb0c659e4d52bbaee80ca340a/cryptography-46.0.7-cp314-cp314t-win_amd64.whl", hash = "sha256:e06acf3c99be55aa3b516397fe42f5855597f430add9c17fa46bf2e0fb34c9bb", size = 3475332, upload-time = "2026-04-08T01:57:08.807Z" },
-    { url = "https://files.pythonhosted.org/packages/a7/7f/cd42fc3614386bc0c12f0cb3c4ae1fc2bbca5c9662dfed031514911d513d/cryptography-46.0.7-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:462ad5cb1c148a22b2e3bcc5ad52504dff325d17daf5df8d88c17dda1f75f2a4", size = 7165618, upload-time = "2026-04-08T01:57:10.645Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/d0/36a49f0262d2319139d2829f773f1b97ef8aef7f97e6e5bd21455e5a8fb5/cryptography-46.0.7-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:84d4cced91f0f159a7ddacad249cc077e63195c36aac40b4150e7a57e84fffe7", size = 4270628, upload-time = "2026-04-08T01:57:12.885Z" },
-    { url = "https://files.pythonhosted.org/packages/8a/6c/1a42450f464dda6ffbe578a911f773e54dd48c10f9895a23a7e88b3e7db5/cryptography-46.0.7-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:128c5edfe5e5938b86b03941e94fac9ee793a94452ad1365c9fc3f4f62216832", size = 4415405, upload-time = "2026-04-08T01:57:14.923Z" },
-    { url = "https://files.pythonhosted.org/packages/9a/92/4ed714dbe93a066dc1f4b4581a464d2d7dbec9046f7c8b7016f5286329e2/cryptography-46.0.7-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:5e51be372b26ef4ba3de3c167cd3d1022934bc838ae9eaad7e644986d2a3d163", size = 4272715, upload-time = "2026-04-08T01:57:16.638Z" },
-    { url = "https://files.pythonhosted.org/packages/b7/e6/a26b84096eddd51494bba19111f8fffe976f6a09f132706f8f1bf03f51f7/cryptography-46.0.7-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:cdf1a610ef82abb396451862739e3fc93b071c844399e15b90726ef7470eeaf2", size = 4918400, upload-time = "2026-04-08T01:57:19.021Z" },
-    { url = "https://files.pythonhosted.org/packages/c7/08/ffd537b605568a148543ac3c2b239708ae0bd635064bab41359252ef88ed/cryptography-46.0.7-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:1d25aee46d0c6f1a501adcddb2d2fee4b979381346a78558ed13e50aa8a59067", size = 4450634, upload-time = "2026-04-08T01:57:21.185Z" },
-    { url = "https://files.pythonhosted.org/packages/16/01/0cd51dd86ab5b9befe0d031e276510491976c3a80e9f6e31810cce46c4ad/cryptography-46.0.7-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:cdfbe22376065ffcf8be74dc9a909f032df19bc58a699456a21712d6e5eabfd0", size = 3985233, upload-time = "2026-04-08T01:57:22.862Z" },
-    { url = "https://files.pythonhosted.org/packages/92/49/819d6ed3a7d9349c2939f81b500a738cb733ab62fbecdbc1e38e83d45e12/cryptography-46.0.7-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:abad9dac36cbf55de6eb49badd4016806b3165d396f64925bf2999bcb67837ba", size = 4271955, upload-time = "2026-04-08T01:57:24.814Z" },
-    { url = "https://files.pythonhosted.org/packages/80/07/ad9b3c56ebb95ed2473d46df0847357e01583f4c52a85754d1a55e29e4d0/cryptography-46.0.7-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:935ce7e3cfdb53e3536119a542b839bb94ec1ad081013e9ab9b7cfd478b05006", size = 4879888, upload-time = "2026-04-08T01:57:26.88Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/c7/201d3d58f30c4c2bdbe9b03844c291feb77c20511cc3586daf7edc12a47b/cryptography-46.0.7-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:35719dc79d4730d30f1c2b6474bd6acda36ae2dfae1e3c16f2051f215df33ce0", size = 4449961, upload-time = "2026-04-08T01:57:29.068Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/ef/649750cbf96f3033c3c976e112265c33906f8e462291a33d77f90356548c/cryptography-46.0.7-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:7bbc6ccf49d05ac8f7d7b5e2e2c33830d4fe2061def88210a126d130d7f71a85", size = 4401696, upload-time = "2026-04-08T01:57:31.029Z" },
-    { url = "https://files.pythonhosted.org/packages/41/52/a8908dcb1a389a459a29008c29966c1d552588d4ae6d43f3a1a4512e0ebe/cryptography-46.0.7-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a1529d614f44b863a7b480c6d000fe93b59acee9c82ffa027cfadc77521a9f5e", size = 4664256, upload-time = "2026-04-08T01:57:33.144Z" },
-    { url = "https://files.pythonhosted.org/packages/4b/fa/f0ab06238e899cc3fb332623f337a7364f36f4bb3f2534c2bb95a35b132c/cryptography-46.0.7-cp38-abi3-win32.whl", hash = "sha256:f247c8c1a1fb45e12586afbb436ef21ff1e80670b2861a90353d9b025583d246", size = 3013001, upload-time = "2026-04-08T01:57:34.933Z" },
-    { url = "https://files.pythonhosted.org/packages/d2/f1/00ce3bde3ca542d1acd8f8cfa38e446840945aa6363f9b74746394b14127/cryptography-46.0.7-cp38-abi3-win_amd64.whl", hash = "sha256:506c4ff91eff4f82bdac7633318a526b1d1309fc07ca76a3ad182cb5b686d6d3", size = 3472985, upload-time = "2026-04-08T01:57:36.714Z" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a4/ba/04b1bd4218cbc58dc90ce967106d51582371b898690f3ae0402876cc4f34/cryptography-46.0.6.tar.gz", hash = "sha256:27550628a518c5c6c903d84f637fbecf287f6cb9ced3804838a1295dc1fd0759", size = 750542, upload-time = "2026-03-25T23:34:53.396Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/47/23/9285e15e3bc57325b0a72e592921983a701efc1ee8f91c06c5f0235d86d9/cryptography-46.0.6-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:64235194bad039a10bb6d2d930ab3323baaec67e2ce36215fd0952fad0930ca8", size = 7176401, upload-time = "2026-03-25T23:33:22.096Z" },
+    { url = "https://files.pythonhosted.org/packages/60/f8/e61f8f13950ab6195b31913b42d39f0f9afc7d93f76710f299b5ec286ae6/cryptography-46.0.6-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:26031f1e5ca62fcb9d1fcb34b2b60b390d1aacaa15dc8b895a9ed00968b97b30", size = 4275275, upload-time = "2026-03-25T23:33:23.844Z" },
+    { url = "https://files.pythonhosted.org/packages/19/69/732a736d12c2631e140be2348b4ad3d226302df63ef64d30dfdb8db7ad1c/cryptography-46.0.6-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9a693028b9cbe51b5a1136232ee8f2bc242e4e19d456ded3fa7c86e43c713b4a", size = 4425320, upload-time = "2026-03-25T23:33:25.703Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/12/123be7292674abf76b21ac1fc0e1af50661f0e5b8f0ec8285faac18eb99e/cryptography-46.0.6-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:67177e8a9f421aa2d3a170c3e56eca4e0128883cf52a071a7cbf53297f18b175", size = 4278082, upload-time = "2026-03-25T23:33:27.423Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/ba/d5e27f8d68c24951b0a484924a84c7cdaed7502bac9f18601cd357f8b1d2/cryptography-46.0.6-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:d9528b535a6c4f8ff37847144b8986a9a143585f0540fbcb1a98115b543aa463", size = 4926514, upload-time = "2026-03-25T23:33:29.206Z" },
+    { url = "https://files.pythonhosted.org/packages/34/71/1ea5a7352ae516d5512d17babe7e1b87d9db5150b21f794b1377eac1edc0/cryptography-46.0.6-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:22259338084d6ae497a19bae5d4c66b7ca1387d3264d1c2c0e72d9e9b6a77b97", size = 4457766, upload-time = "2026-03-25T23:33:30.834Z" },
+    { url = "https://files.pythonhosted.org/packages/01/59/562be1e653accee4fdad92c7a2e88fced26b3fdfce144047519bbebc299e/cryptography-46.0.6-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:760997a4b950ff00d418398ad73fbc91aa2894b5c1db7ccb45b4f68b42a63b3c", size = 3986535, upload-time = "2026-03-25T23:33:33.02Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/8b/b1ebfeb788bf4624d36e45ed2662b8bd43a05ff62157093c1539c1288a18/cryptography-46.0.6-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:3dfa6567f2e9e4c5dceb8ccb5a708158a2a871052fa75c8b78cb0977063f1507", size = 4277618, upload-time = "2026-03-25T23:33:34.567Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/52/a005f8eabdb28df57c20f84c44d397a755782d6ff6d455f05baa2785bd91/cryptography-46.0.6-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:cdcd3edcbc5d55757e5f5f3d330dd00007ae463a7e7aa5bf132d1f22a4b62b19", size = 4890802, upload-time = "2026-03-25T23:33:37.034Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/4d/8e7d7245c79c617d08724e2efa397737715ca0ec830ecb3c91e547302555/cryptography-46.0.6-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:d4e4aadb7fc1f88687f47ca20bb7227981b03afaae69287029da08096853b738", size = 4457425, upload-time = "2026-03-25T23:33:38.904Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/5c/f6c3596a1430cec6f949085f0e1a970638d76f81c3ea56d93d564d04c340/cryptography-46.0.6-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:2b417edbe8877cda9022dde3a008e2deb50be9c407eef034aeeb3a8b11d9db3c", size = 4405530, upload-time = "2026-03-25T23:33:40.842Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/c9/9f9cea13ee2dbde070424e0c4f621c091a91ffcc504ffea5e74f0e1daeff/cryptography-46.0.6-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:380343e0653b1c9d7e1f55b52aaa2dbb2fdf2730088d48c43ca1c7c0abb7cc2f", size = 4667896, upload-time = "2026-03-25T23:33:42.781Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/b5/1895bc0821226f129bc74d00eccfc6a5969e2028f8617c09790bf89c185e/cryptography-46.0.6-cp311-abi3-win32.whl", hash = "sha256:bcb87663e1f7b075e48c3be3ecb5f0b46c8fc50b50a97cf264e7f60242dca3f2", size = 3026348, upload-time = "2026-03-25T23:33:45.021Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/f8/c9bcbf0d3e6ad288b9d9aa0b1dee04b063d19e8c4f871855a03ab3a297ab/cryptography-46.0.6-cp311-abi3-win_amd64.whl", hash = "sha256:6739d56300662c468fddb0e5e291f9b4d084bead381667b9e654c7dd81705124", size = 3483896, upload-time = "2026-03-25T23:33:46.649Z" },
+    { url = "https://files.pythonhosted.org/packages/01/41/3a578f7fd5c70611c0aacba52cd13cb364a5dee895a5c1d467208a9380b0/cryptography-46.0.6-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:2ef9e69886cbb137c2aef9772c2e7138dc581fad4fcbcf13cc181eb5a3ab6275", size = 7117147, upload-time = "2026-03-25T23:33:48.249Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/87/887f35a6fca9dde90cad08e0de0c89263a8e59b2d2ff904fd9fcd8025b6f/cryptography-46.0.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7f417f034f91dcec1cb6c5c35b07cdbb2ef262557f701b4ecd803ee8cefed4f4", size = 4266221, upload-time = "2026-03-25T23:33:49.874Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/a8/0a90c4f0b0871e0e3d1ed126aed101328a8a57fd9fd17f00fb67e82a51ca/cryptography-46.0.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d24c13369e856b94892a89ddf70b332e0b70ad4a5c43cf3e9cb71d6d7ffa1f7b", size = 4408952, upload-time = "2026-03-25T23:33:52.128Z" },
+    { url = "https://files.pythonhosted.org/packages/16/0b/b239701eb946523e4e9f329336e4ff32b1247e109cbab32d1a7b61da8ed7/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:aad75154a7ac9039936d50cf431719a2f8d4ed3d3c277ac03f3339ded1a5e707", size = 4270141, upload-time = "2026-03-25T23:33:54.11Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/a8/976acdd4f0f30df7b25605f4b9d3d89295351665c2091d18224f7ad5cdbf/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:3c21d92ed15e9cfc6eb64c1f5a0326db22ca9c2566ca46d845119b45b4400361", size = 4904178, upload-time = "2026-03-25T23:33:55.725Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/1b/bf0e01a88efd0e59679b69f42d4afd5bced8700bb5e80617b2d63a3741af/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:4668298aef7cddeaf5c6ecc244c2302a2b8e40f384255505c22875eebb47888b", size = 4441812, upload-time = "2026-03-25T23:33:57.364Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/8b/11df86de2ea389c65aa1806f331cae145f2ed18011f30234cc10ca253de8/cryptography-46.0.6-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:8ce35b77aaf02f3b59c90b2c8a05c73bac12cea5b4e8f3fbece1f5fddea5f0ca", size = 3963923, upload-time = "2026-03-25T23:33:59.361Z" },
+    { url = "https://files.pythonhosted.org/packages/91/e0/207fb177c3a9ef6a8108f234208c3e9e76a6aa8cf20d51932916bd43bda0/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c89eb37fae9216985d8734c1afd172ba4927f5a05cfd9bf0e4863c6d5465b013", size = 4269695, upload-time = "2026-03-25T23:34:00.909Z" },
+    { url = "https://files.pythonhosted.org/packages/21/5e/19f3260ed1e95bced52ace7501fabcd266df67077eeb382b79c81729d2d3/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:ed418c37d095aeddf5336898a132fba01091f0ac5844e3e8018506f014b6d2c4", size = 4869785, upload-time = "2026-03-25T23:34:02.796Z" },
+    { url = "https://files.pythonhosted.org/packages/10/38/cd7864d79aa1d92ef6f1a584281433419b955ad5a5ba8d1eb6c872165bcb/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:69cf0056d6947edc6e6760e5f17afe4bea06b56a9ac8a06de9d2bd6b532d4f3a", size = 4441404, upload-time = "2026-03-25T23:34:04.35Z" },
+    { url = "https://files.pythonhosted.org/packages/09/0a/4fe7a8d25fed74419f91835cf5829ade6408fd1963c9eae9c4bce390ecbb/cryptography-46.0.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8e7304c4f4e9490e11efe56af6713983460ee0780f16c63f219984dab3af9d2d", size = 4397549, upload-time = "2026-03-25T23:34:06.342Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/a0/7d738944eac6513cd60a8da98b65951f4a3b279b93479a7e8926d9cd730b/cryptography-46.0.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b928a3ca837c77a10e81a814a693f2295200adb3352395fad024559b7be7a736", size = 4651874, upload-time = "2026-03-25T23:34:07.916Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/f1/c2326781ca05208845efca38bf714f76939ae446cd492d7613808badedf1/cryptography-46.0.6-cp314-cp314t-win32.whl", hash = "sha256:97c8115b27e19e592a05c45d0dd89c57f81f841cc9880e353e0d3bf25b2139ed", size = 3001511, upload-time = "2026-03-25T23:34:09.892Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/57/fe4a23eb549ac9d903bd4698ffda13383808ef0876cc912bcb2838799ece/cryptography-46.0.6-cp314-cp314t-win_amd64.whl", hash = "sha256:c797e2517cb7880f8297e2c0f43bb910e91381339336f75d2c1c2cbf811b70b4", size = 3471692, upload-time = "2026-03-25T23:34:11.613Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/cc/f330e982852403da79008552de9906804568ae9230da8432f7496ce02b71/cryptography-46.0.6-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:12cae594e9473bca1a7aceb90536060643128bb274fcea0fc459ab90f7d1ae7a", size = 7162776, upload-time = "2026-03-25T23:34:13.308Z" },
+    { url = "https://files.pythonhosted.org/packages/49/b3/dc27efd8dcc4bff583b3f01d4a3943cd8b5821777a58b3a6a5f054d61b79/cryptography-46.0.6-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:639301950939d844a9e1c4464d7e07f902fe9a7f6b215bb0d4f28584729935d8", size = 4270529, upload-time = "2026-03-25T23:34:15.019Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/05/e8d0e6eb4f0d83365b3cb0e00eb3c484f7348db0266652ccd84632a3d58d/cryptography-46.0.6-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ed3775295fb91f70b4027aeba878d79b3e55c0b3e97eaa4de71f8f23a9f2eb77", size = 4414827, upload-time = "2026-03-25T23:34:16.604Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/97/daba0f5d2dc6d855e2dcb70733c812558a7977a55dd4a6722756628c44d1/cryptography-46.0.6-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8927ccfbe967c7df312ade694f987e7e9e22b2425976ddbf28271d7e58845290", size = 4271265, upload-time = "2026-03-25T23:34:18.586Z" },
+    { url = "https://files.pythonhosted.org/packages/89/06/fe1fce39a37ac452e58d04b43b0855261dac320a2ebf8f5260dd55b201a9/cryptography-46.0.6-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:b12c6b1e1651e42ab5de8b1e00dc3b6354fdfd778e7fa60541ddacc27cd21410", size = 4916800, upload-time = "2026-03-25T23:34:20.561Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/8a/b14f3101fe9c3592603339eb5d94046c3ce5f7fc76d6512a2d40efd9724e/cryptography-46.0.6-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:063b67749f338ca9c5a0b7fe438a52c25f9526b851e24e6c9310e7195aad3b4d", size = 4448771, upload-time = "2026-03-25T23:34:22.406Z" },
+    { url = "https://files.pythonhosted.org/packages/01/b3/0796998056a66d1973fd52ee89dc1bb3b6581960a91ad4ac705f182d398f/cryptography-46.0.6-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:02fad249cb0e090b574e30b276a3da6a149e04ee2f049725b1f69e7b8351ec70", size = 3978333, upload-time = "2026-03-25T23:34:24.281Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/3d/db200af5a4ffd08918cd55c08399dc6c9c50b0bc72c00a3246e099d3a849/cryptography-46.0.6-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:7e6142674f2a9291463e5e150090b95a8519b2fb6e6aaec8917dd8d094ce750d", size = 4271069, upload-time = "2026-03-25T23:34:25.895Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/18/61acfd5b414309d74ee838be321c636fe71815436f53c9f0334bf19064fa/cryptography-46.0.6-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:456b3215172aeefb9284550b162801d62f5f264a081049a3e94307fe20792cfa", size = 4878358, upload-time = "2026-03-25T23:34:27.67Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/65/5bf43286d566f8171917cae23ac6add941654ccf085d739195a4eacf1674/cryptography-46.0.6-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:341359d6c9e68834e204ceaf25936dffeafea3829ab80e9503860dcc4f4dac58", size = 4448061, upload-time = "2026-03-25T23:34:29.375Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/25/7e49c0fa7205cf3597e525d156a6bce5b5c9de1fd7e8cb01120e459f205a/cryptography-46.0.6-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9a9c42a2723999a710445bc0d974e345c32adfd8d2fac6d8a251fa829ad31cfb", size = 4399103, upload-time = "2026-03-25T23:34:32.036Z" },
+    { url = "https://files.pythonhosted.org/packages/44/46/466269e833f1c4718d6cd496ffe20c56c9c8d013486ff66b4f69c302a68d/cryptography-46.0.6-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6617f67b1606dfd9fe4dbfa354a9508d4a6d37afe30306fe6c101b7ce3274b72", size = 4659255, upload-time = "2026-03-25T23:34:33.679Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/09/ddc5f630cc32287d2c953fc5d32705e63ec73e37308e5120955316f53827/cryptography-46.0.6-cp38-abi3-win32.whl", hash = "sha256:7f6690b6c55e9c5332c0b59b9c8a3fb232ebf059094c17f9019a51e9827df91c", size = 3010660, upload-time = "2026-03-25T23:34:35.418Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/82/ca4893968aeb2709aacfb57a30dec6fa2ab25b10fa9f064b8882ce33f599/cryptography-46.0.6-cp38-abi3-win_amd64.whl", hash = "sha256:79e865c642cfc5c0b3eb12af83c35c5aeff4fa5c672dc28c43721c2c9fdd2f0f", size = 3471160, upload-time = "2026-03-25T23:34:37.191Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/84/7ccff00ced5bac74b775ce0beb7d1be4e8637536b522b5df9b73ada42da2/cryptography-46.0.6-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:2ea0f37e9a9cf0df2952893ad145fd9627d326a59daec9b0802480fa3bcd2ead", size = 3475444, upload-time = "2026-03-25T23:34:38.944Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/1f/4c926f50df7749f000f20eede0c896769509895e2648db5da0ed55db711d/cryptography-46.0.6-pp311-pypy311_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a3e84d5ec9ba01f8fd03802b2147ba77f0c8f2617b2aff254cedd551844209c8", size = 4218227, upload-time = "2026-03-25T23:34:40.871Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/65/707be3ffbd5f786028665c3223e86e11c4cda86023adbc56bd72b1b6bab5/cryptography-46.0.6-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:12f0fa16cc247b13c43d56d7b35287ff1569b5b1f4c5e87e92cc4fcc00cd10c0", size = 4381399, upload-time = "2026-03-25T23:34:42.609Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/6d/73557ed0ef7d73d04d9aba745d2c8e95218213687ee5e76b7d236a5030fc/cryptography-46.0.6-pp311-pypy311_pp73-manylinux_2_34_aarch64.whl", hash = "sha256:50575a76e2951fe7dbd1f56d181f8c5ceeeb075e9ff88e7ad997d2f42af06e7b", size = 4217595, upload-time = "2026-03-25T23:34:44.205Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/c5/e1594c4eec66a567c3ac4400008108a415808be2ce13dcb9a9045c92f1a0/cryptography-46.0.6-pp311-pypy311_pp73-manylinux_2_34_x86_64.whl", hash = "sha256:90e5f0a7b3be5f40c3a0a0eafb32c681d8d2c181fc2a1bdabe9b3f611d9f6b1a", size = 4380912, upload-time = "2026-03-25T23:34:46.328Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/89/843b53614b47f97fe1abc13f9a86efa5ec9e275292c457af1d4a60dc80e0/cryptography-46.0.6-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:6728c49e3b2c180ef26f8e9f0a883a2c585638db64cf265b49c9ba10652d430e", size = 3409955, upload-time = "2026-03-25T23:34:48.465Z" },
 ]
 
 [[package]]
@@ -965,6 +1216,12 @@ dependencies = [
     { name = "cuda-pathfinder" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/1a/fe/7351d7e586a8b4c9f89731bfe4cf0148223e8f9903ff09571f78b3fb0682/cuda_bindings-13.2.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:08b395f79cb89ce0cd8effff07c4a1e20101b873c256a1aeb286e8fd7bd0f556", size = 5744254, upload-time = "2026-03-11T00:12:29.798Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/ef/184aa775e970fc089942cd9ec6302e6e44679d4c14549c6a7ea45bf7f798/cuda_bindings-13.2.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6f3682ec3c4769326aafc67c2ba669d97d688d0b7e63e659d36d2f8b72f32d6", size = 6329075, upload-time = "2026-03-11T00:12:32.319Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/ea/81999d01375645f34596c76eb046b4b36d58cc6fe2bddb2410f8a7b7a827/cuda_bindings-13.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:845025438a1b9e20718b9fb42add3e0eb72e85458bcab3eeb80bfd8f0a9dab33", size = 5600047, upload-time = "2026-03-11T00:12:34.848Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/a9/3a8241c6e19483ac1f1dcf5c10238205dcb8a6e9d0d4d4709240dff28ff4/cuda_bindings-13.2.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:721104c603f059780d287969be3d194a18d0cc3b713ed9049065a1107706759d", size = 5730273, upload-time = "2026-03-11T00:12:37.18Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/94/2748597f47bb1600cd466b20cab4159f1530a3a33fe7f70fee199b3abb9e/cuda_bindings-13.2.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1eba9504ac70667dd48313395fe05157518fd6371b532790e96fbb31bbb5a5e1", size = 6313924, upload-time = "2026-03-11T00:12:39.462Z" },
+    { url = "https://files.pythonhosted.org/packages/29/5a/0ce1731c48bcd9f40996a4ef1abbf634f1a7fe4a15c5050b1e75ce3a7acf/cuda_bindings-13.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:debb51b211d246f8326f6b6e982506a5d0d9906672c91bc478b66addc7ecc60a", size = 5631363, upload-time = "2026-03-11T00:12:41.58Z" },
     { url = "https://files.pythonhosted.org/packages/52/c8/b2589d68acf7e3d63e2be330b84bc25712e97ed799affbca7edd7eae25d6/cuda_bindings-13.2.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e865447abfb83d6a98ad5130ed3c70b1fc295ae3eeee39fd07b4ddb0671b6788", size = 5722404, upload-time = "2026-03-11T00:12:44.041Z" },
     { url = "https://files.pythonhosted.org/packages/1f/92/f899f7bbb5617bb65ec52a6eac1e9a1447a86b916c4194f8a5001b8cde0c/cuda_bindings-13.2.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:46d8776a55d6d5da9dd6e9858fba2efcda2abe6743871dee47dd06eb8cb6d955", size = 6320619, upload-time = "2026-03-11T00:12:45.939Z" },
     { url = "https://files.pythonhosted.org/packages/bb/a5/d7f01a415e134546248cef612adad8153c9f1eb10ec79505a7cd8294370b/cuda_bindings-13.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:45815daeb595bf3b405c52671a2542b1f8e9329f3b029494acbfcc74aeaa1f2d", size = 5840830, upload-time = "2026-03-11T00:12:48.43Z" },
@@ -981,10 +1238,10 @@ wheels = [
 
 [[package]]
 name = "cuda-pathfinder"
-version = "1.5.3"
+version = "1.5.1"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d3/d6/ac63065d33dd700fee7ebd7d287332401b54e31b9346e142f871e1f0b116/cuda_pathfinder-1.5.3-py3-none-any.whl", hash = "sha256:dff021123aedbb4117cc7ec81717bbfe198fb4e8b5f1ee57e0e084fec5c8577d", size = 49991, upload-time = "2026-04-14T20:09:27.037Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/74/8c66861b873d8eed51fde56d3091baa4906a56f0d4390cae991f2d41dda5/cuda_pathfinder-1.5.1-py3-none-any.whl", hash = "sha256:b3718097fb57cf9e8a904dd072d806f2c9a27627e35c020b06ab9454bcec08c0", size = 49861, upload-time = "2026-04-03T16:41:22.203Z" },
 ]
 
 [[package]]
@@ -1009,37 +1266,37 @@ wheels = [
 
 [package.optional-dependencies]
 cublas = [
-    { name = "nvidia-cublas", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 cudart = [
-    { name = "nvidia-cuda-runtime", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-cuda-runtime", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 cufft = [
-    { name = "nvidia-cufft", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-cufft", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 cufile = [
     { name = "nvidia-cufile", marker = "sys_platform == 'linux'" },
 ]
 cupti = [
-    { name = "nvidia-cuda-cupti", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-cuda-cupti", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 curand = [
-    { name = "nvidia-curand", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-curand", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 cusolver = [
-    { name = "nvidia-cusolver", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-cusolver", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 cusparse = [
-    { name = "nvidia-cusparse", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-cusparse", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 nvjitlink = [
-    { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 nvrtc = [
-    { name = "nvidia-cuda-nvrtc", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-cuda-nvrtc", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 nvtx = [
-    { name = "nvidia-nvtx", marker = "sys_platform == 'linux' or sys_platform == 'win32'" },
+    { name = "nvidia-nvtx", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or sys_platform == 'linux'" },
 ]
 
 [[package]]
@@ -1048,6 +1305,14 @@ version = "3.2.4"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/91/85/7574c9cd44b69a27210444b6650f6477f56c75fee1b70d7672d3e4166167/cython-3.2.4.tar.gz", hash = "sha256:84226ecd313b233da27dc2eb3601b4f222b8209c3a7216d8733b031da1dc64e6", size = 3280291, upload-time = "2026-01-04T14:14:14.473Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/a1/10/720e0fb84eab4c927c4dd6b61eb7993f7732dd83d29ba6d73083874eade9/cython-3.2.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02cb0cc0f23b9874ad262d7d2b9560aed9c7e2df07b49b920bda6f2cc9cb505e", size = 2960836, upload-time = "2026-01-04T14:14:51.103Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/3d/b26f29092c71c36e0462752885bdfb18c23c176af4de953fdae2772a8941/cython-3.2.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f136f379a4a54246facd0eb6f1ee15c3837cb314ce87b677582ec014db4c6845", size = 3370134, upload-time = "2026-01-04T14:14:53.627Z" },
+    { url = "https://files.pythonhosted.org/packages/56/9e/539fb0d09e4f5251b5b14f8daf77e71fee021527f1013791038234618b6b/cython-3.2.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:35ab0632186057406ec729374c737c37051d2eacad9d515d94e5a3b3e58a9b02", size = 3537552, upload-time = "2026-01-04T14:14:56.852Z" },
+    { url = "https://files.pythonhosted.org/packages/10/c6/82d19a451c050d1be0f05b1a3302267463d391db548f013ee88b5348a8e9/cython-3.2.4-cp310-cp310-win_amd64.whl", hash = "sha256:ca2399dc75796b785f74fb85c938254fa10c80272004d573c455f9123eceed86", size = 2766191, upload-time = "2026-01-04T14:14:58.709Z" },
+    { url = "https://files.pythonhosted.org/packages/85/cc/8f06145ec3efa121c8b1b67f06a640386ddacd77ee3e574da582a21b14ee/cython-3.2.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff9af2134c05e3734064808db95b4dd7341a39af06e8945d05ea358e1741aaed", size = 2953769, upload-time = "2026-01-04T14:15:00.361Z" },
+    { url = "https://files.pythonhosted.org/packages/55/b0/706cf830eddd831666208af1b3058c2e0758ae157590909c1f634b53bed9/cython-3.2.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67922c9de058a0bfb72d2e75222c52d09395614108c68a76d9800f150296ddb3", size = 3243841, upload-time = "2026-01-04T14:15:02.066Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/25/58893afd4ef45f79e3d4db82742fa4ff874b936d67a83c92939053920ccd/cython-3.2.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b362819d155fff1482575e804e43e3a8825332d32baa15245f4642022664a3f4", size = 3378083, upload-time = "2026-01-04T14:15:04.248Z" },
+    { url = "https://files.pythonhosted.org/packages/32/e4/424a004d7c0d8a4050c81846ebbd22272ececfa9a498cb340aa44fccbec2/cython-3.2.4-cp311-cp311-win_amd64.whl", hash = "sha256:1a64a112a34ec719b47c01395647e54fb4cf088a511613f9a3a5196694e8e382", size = 2769990, upload-time = "2026-01-04T14:15:06.53Z" },
     { url = "https://files.pythonhosted.org/packages/91/4d/1eb0c7c196a136b1926f4d7f0492a96c6fabd604d77e6cd43b56a3a16d83/cython-3.2.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:64d7f71be3dd6d6d4a4c575bb3a4674ea06d1e1e5e4cd1b9882a2bc40ed3c4c9", size = 2970064, upload-time = "2026-01-04T14:15:08.567Z" },
     { url = "https://files.pythonhosted.org/packages/03/1c/46e34b08bea19a1cdd1e938a4c123e6299241074642db9d81983cef95e9f/cython-3.2.4-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:869487ea41d004f8b92171f42271fbfadb1ec03bede3158705d16cd570d6b891", size = 3226757, upload-time = "2026-01-04T14:15:10.812Z" },
     { url = "https://files.pythonhosted.org/packages/12/33/3298a44d201c45bcf0d769659725ae70e9c6c42adf8032f6d89c8241098d/cython-3.2.4-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:55b6c44cd30821f0b25220ceba6fe636ede48981d2a41b9bbfe3c7902ce44ea7", size = 3388969, upload-time = "2026-01-04T14:15:12.45Z" },
@@ -1072,77 +1337,27 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ff/fa/d3c15189f7c52aaefbaea76fb012119b04b9013f4bf446cb4eb4c26c4e6b/cython-3.2.4-py3-none-any.whl", hash = "sha256:732fc93bc33ae4b14f6afaca663b916c2fdd5dcbfad7114e17fb2434eeaea45c", size = 1257078, upload-time = "2026-01-04T14:14:12.373Z" },
 ]
 
-[[package]]
-name = "datasets"
-version = "2.2.1"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32'",
-]
-dependencies = [
-    { name = "aiohttp", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "dill", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "fsspec", version = "2026.3.0", source = { registry = "https://pypi.org/simple" }, extra = ["http"], marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "huggingface-hub", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "multiprocess", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "pandas", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "pyarrow", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "requests", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "responses", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "tqdm", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "xxhash", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/31/64/1e6fb2a0eb6b0d55117233cf33279ba6d680c0f031ebae81281a47c92760/datasets-2.2.1.tar.gz", hash = "sha256:d362717c4394589b516c8f397ff20a6fe720454aed877ab61d06f3bc05df9544", size = 302132, upload-time = "2022-05-11T17:02:29.543Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d7/2d/41e8aec8d4bad6f07adfcbc89cf743e0d31c876371d453b2936bcfa7fe34/datasets-2.2.1-py3-none-any.whl", hash = "sha256:1938f3e99599422de50b9b54fe802aca854ed130382dab0b3820c821f7ae6d5e", size = 342193, upload-time = "2022-05-11T17:02:27.047Z" },
-]
-
 [[package]]
 name = "datasets"
 version = "4.8.4"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-]
-dependencies = [
-    { name = "dill", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "filelock", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, extra = ["http"], marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "httpx", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "huggingface-hub", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "multiprocess", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "pandas", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "pyarrow", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "pyyaml", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "requests", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "tqdm", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "xxhash", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
+dependencies = [
+    { name = "dill" },
+    { name = "filelock" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, extra = ["http"], marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "httpx" },
+    { name = "huggingface-hub" },
+    { name = "multiprocess" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging" },
+    { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "pandas", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pyarrow" },
+    { name = "pyyaml" },
+    { name = "requests" },
+    { name = "tqdm" },
+    { name = "xxhash" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/22/22/73e46ac7a8c25e7ef0b3bd6f10da3465021d90219a32eb0b4d2afea4c56e/datasets-4.8.4.tar.gz", hash = "sha256:a1429ed853275ce7943a01c6d2e25475b4501eb758934362106a280470df3a52", size = 604382, upload-time = "2026-03-23T14:21:17.987Z" }
 wheels = [
@@ -1222,17 +1437,95 @@ wheels = [
 
 [[package]]
 name = "docstring-parser"
-version = "0.18.0"
+version = "0.17.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/e0/4d/f332313098c1de1b2d2ff91cf2674415cc7cddab2ca1b01ae29774bd5fdf/docstring_parser-0.18.0.tar.gz", hash = "sha256:292510982205c12b1248696f44959db3cdd1740237a968ea1e2e7a900eeb2015", size = 29341, upload-time = "2026-04-14T04:09:19.867Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b2/9d/c3b43da9515bd270df0f80548d9944e389870713cc1fe2b8fb35fe2bcefd/docstring_parser-0.17.0.tar.gz", hash = "sha256:583de4a309722b3315439bb31d64ba3eebada841f2e2cee23b99df001434c912", size = 27442, upload-time = "2025-07-21T07:35:01.868Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a7/5f/ed01f9a3cdffbd5a008556fc7b2a08ddb1cc6ace7effa7340604b1d16699/docstring_parser-0.18.0-py3-none-any.whl", hash = "sha256:b3fcbed555c47d8479be0796ef7e19c2670d428d72e96da63f3a40122860374b", size = 22484, upload-time = "2026-04-14T04:09:18.638Z" },
+    { url = "https://files.pythonhosted.org/packages/55/e2/2537ebcff11c1ee1ff17d8d0b6f4db75873e3b0fb32c2d4a2ee31ecb310a/docstring_parser-0.17.0-py3-none-any.whl", hash = "sha256:cf2569abd23dce8099b300f9b4fa8191e9582dda731fd533daf54c4551658708", size = 36896, upload-time = "2025-07-21T07:35:00.684Z" },
+]
+
+[[package]]
+name = "docutils"
+version = "0.21.2"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ae/ed/aefcc8cd0ba62a0560c3c18c33925362d46c6075480bfa4df87b28e169a9/docutils-0.21.2.tar.gz", hash = "sha256:3a6b18732edf182daa3cd12775bbb338cf5691468f91eeeb109deff6ebfa986f", size = 2204444, upload-time = "2024-04-23T18:57:18.24Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8f/d7/9322c609343d929e75e7e5e6255e614fcc67572cfd083959cdef3b7aad79/docutils-0.21.2-py3-none-any.whl", hash = "sha256:dafca5b9e384f0e419294eb4d2ff9fa826435bf15f15b7bd45723e8ad76811b2", size = 587408, upload-time = "2024-04-23T18:57:14.835Z" },
 ]
 
 [[package]]
 name = "docutils"
 version = "0.22.4"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
 sdist = { url = "https://files.pythonhosted.org/packages/ae/b6/03bb70946330e88ffec97aefd3ea75ba575cb2e762061e0e62a213befee8/docutils-0.22.4.tar.gz", hash = "sha256:4db53b1fde9abecbb74d91230d32ab626d94f6badfc575d6db9194a49df29968", size = 2291750, upload-time = "2025-12-18T19:00:26.443Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/02/10/5da547df7a391dcde17f59520a231527b8571e6f46fc8efb02ccb370ab12/docutils-0.22.4-py3-none-any.whl", hash = "sha256:d0013f540772d1420576855455d050a2180186c91c15779301ac2ccb3eeb68de", size = 633196, upload-time = "2025-12-18T19:00:18.077Z" },
@@ -1271,8 +1564,8 @@ name = "emerging-optimizers"
 version = "0.2.0"
 source = { git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.2.0#1effa026ff096b7fa1063ca2fba19d98be6e6cdf" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "torch", marker = "sys_platform == 'never'" },
+    { name = "absl-py", marker = "python_full_version >= '3.12'" },
+    { name = "torch", marker = "python_full_version >= '3.12' and sys_platform == 'never'" },
 ]
 
 [[package]]
@@ -1280,7 +1573,7 @@ name = "exceptiongroup"
 version = "1.3.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "typing-extensions", marker = "python_full_version < '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/50/79/66800aadf48771f6b62f7eb014e352e5d06856655206165d775e675a02c9/exceptiongroup-1.3.1.tar.gz", hash = "sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219", size = 30371, upload-time = "2025-11-21T23:01:54.787Z" }
 wheels = [
@@ -1302,9 +1595,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/37/f9/f8497ef8b873a8bb2a750ee2a6c5f0fc22258e1acb6245fd237042a6c279/fabric-3.2.3-py3-none-any.whl", hash = "sha256:ce61917f4f398018337ce279b357650a3a74baecf3fdd53a5839013944af965e", size = 59502, upload-time = "2026-04-06T00:00:10.176Z" },
 ]
 
+[[package]]
+name = "fast-hadamard-transform"
+version = "1.0.4.post1"
+source = { git = "https://github.com/Dao-AILab/fast-hadamard-transform.git?rev=f134af63deb2df17e1171a9ec1ea4a7d8604d5ca#f134af63deb2df17e1171a9ec1ea4a7d8604d5ca" }
+dependencies = [
+    { name = "ninja" },
+    { name = "packaging" },
+    { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+
 [[package]]
 name = "fastapi"
-version = "0.136.0"
+version = "0.135.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "annotated-doc" },
@@ -1313,9 +1616,9 @@ dependencies = [
     { name = "typing-extensions" },
     { name = "typing-inspection" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/4e/d9/e66315807e41e69e7f6a1b42a162dada2f249c5f06ad3f1a95f84ab336ef/fastapi-0.136.0.tar.gz", hash = "sha256:cf08e067cc66e106e102d9ba659463abfac245200752f8a5b7b1e813de4ff73e", size = 396607, upload-time = "2026-04-16T11:47:13.623Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f7/e6/7adb4c5fa231e82c35b8f5741a9f2d055f520c29af5546fd70d3e8e1cd2e/fastapi-0.135.3.tar.gz", hash = "sha256:bd6d7caf1a2bdd8d676843cdcd2287729572a1ef524fc4d65c17ae002a1be654", size = 396524, upload-time = "2026-04-01T16:23:58.188Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/26/a3/0bd5f0cdb0bbc92650e8dc457e9250358411ee5d1b65e42b6632387daf81/fastapi-0.136.0-py3-none-any.whl", hash = "sha256:8793d44ec7378e2be07f8a013cf7f7aa47d6327d0dfe9804862688ec4541a6b4", size = 117556, upload-time = "2026-04-16T11:47:11.922Z" },
+    { url = "https://files.pythonhosted.org/packages/84/a4/5caa2de7f917a04ada20018eccf60d6cc6145b0199d55ca3711b0fc08312/fastapi-0.135.3-py3-none-any.whl", hash = "sha256:9b0f590c813acd13d0ab43dd8494138eb58e484bfac405db1f3187cfc5810d98", size = 117734, upload-time = "2026-04-01T16:23:59.328Z" },
 ]
 
 [[package]]
@@ -1335,11 +1638,11 @@ wheels = [
 
 [[package]]
 name = "filelock"
-version = "3.29.0"
+version = "3.25.2"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/b5/fe/997687a931ab51049acce6fa1f23e8f01216374ea81374ddee763c493db5/filelock-3.29.0.tar.gz", hash = "sha256:69974355e960702e789734cb4871f884ea6fe50bd8404051a3530bc07809cf90", size = 57571, upload-time = "2026-04-19T15:39:10.068Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/94/b8/00651a0f559862f3bb7d6f7477b192afe3f583cc5e26403b44e59a55ab34/filelock-3.25.2.tar.gz", hash = "sha256:b64ece2b38f4ca29dd3e810287aa8c48182bbecd1ae6e9ae126c9b35f1382694", size = 40480, upload-time = "2026-03-11T20:45:38.487Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/81/47/dd9a212ef6e343a6857485ffe25bba537304f1913bdbed446a23f7f592e1/filelock-3.29.0-py3-none-any.whl", hash = "sha256:96f5f6344709aa1572bbf631c640e4ebeeb519e08da902c39a001882f30ac258", size = 39812, upload-time = "2026-04-19T15:39:08.752Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/a5/842ae8f0c08b61d6484b52f99a03510a3a72d23141942d216ebe81fefbce/filelock-3.25.2-py3-none-any.whl", hash = "sha256:ca8afb0da15f229774c9ad1b455ed96e85a81373065fb10446672f64444ddf70", size = 26759, upload-time = "2026-03-11T20:45:37.437Z" },
 ]
 
 [[package]]
@@ -1391,11 +1694,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/60/ee/a3cba17965482b35c4990af90bad108e82c32edcb59911c37f318b5f4198/flash_linear_attention-0.4.2-py3-none-any.whl", hash = "sha256:c08be006ce4dbe1be81f54938ee8e6fc7968cfba397c8d06c7669e97b8c44c0d", size = 284661, upload-time = "2026-03-12T14:45:44.905Z" },
 ]
 
-[[package]]
-name = "flash-mla"
-version = "1.0.0+9edee0c"
-source = { git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19#9edee0c022cd0938148a18e334203b0aab43aa19" }
-
 [[package]]
 name = "flashinfer-python"
 version = "0.5.3"
@@ -1405,13 +1703,12 @@ dependencies = [
     { name = "click" },
     { name = "einops" },
     { name = "ninja" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-cudnn-frontend" },
     { name = "nvidia-cutlass-dsl" },
     { name = "nvidia-ml-py" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "packaging" },
     { name = "requests" },
     { name = "tabulate" },
     { name = "torch", marker = "sys_platform == 'never'" },
@@ -1460,6 +1757,38 @@ version = "1.8.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/2d/f5/c831fac6cc817d26fd54c7eaccd04ef7e0288806943f7cc5bbf69f3ac1f0/frozenlist-1.8.0.tar.gz", hash = "sha256:3ede829ed8d842f6cd48fc7081d7a41001a56f1f38603f9d49bf3020d59a31ad", size = 45875, upload-time = "2025-10-06T05:38:17.865Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/83/4a/557715d5047da48d54e659203b9335be7bfaafda2c3f627b7c47e0b3aaf3/frozenlist-1.8.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b37f6d31b3dcea7deb5e9696e529a6aa4a898adc33db82da12e4c60a7c4d2011", size = 86230, upload-time = "2025-10-06T05:35:23.699Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/fb/c85f9fed3ea8fe8740e5b46a59cc141c23b842eca617da8876cfce5f760e/frozenlist-1.8.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ef2b7b394f208233e471abc541cc6991f907ffd47dc72584acee3147899d6565", size = 49621, upload-time = "2025-10-06T05:35:25.341Z" },
+    { url = "https://files.pythonhosted.org/packages/63/70/26ca3f06aace16f2352796b08704338d74b6d1a24ca38f2771afbb7ed915/frozenlist-1.8.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a88f062f072d1589b7b46e951698950e7da00442fc1cacbe17e19e025dc327ad", size = 49889, upload-time = "2025-10-06T05:35:26.797Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/ed/c7895fd2fde7f3ee70d248175f9b6cdf792fb741ab92dc59cd9ef3bd241b/frozenlist-1.8.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f57fb59d9f385710aa7060e89410aeb5058b99e62f4d16b08b91986b9a2140c2", size = 219464, upload-time = "2025-10-06T05:35:28.254Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/83/4d587dccbfca74cb8b810472392ad62bfa100bf8108c7223eb4c4fa2f7b3/frozenlist-1.8.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:799345ab092bee59f01a915620b5d014698547afd011e691a208637312db9186", size = 221649, upload-time = "2025-10-06T05:35:29.454Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/c6/fd3b9cd046ec5fff9dab66831083bc2077006a874a2d3d9247dea93ddf7e/frozenlist-1.8.0-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:c23c3ff005322a6e16f71bf8692fcf4d5a304aaafe1e262c98c6d4adc7be863e", size = 219188, upload-time = "2025-10-06T05:35:30.951Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/80/6693f55eb2e085fc8afb28cf611448fb5b90e98e068fa1d1b8d8e66e5c7d/frozenlist-1.8.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8a76ea0f0b9dfa06f254ee06053d93a600865b3274358ca48a352ce4f0798450", size = 231748, upload-time = "2025-10-06T05:35:32.101Z" },
+    { url = "https://files.pythonhosted.org/packages/97/d6/e9459f7c5183854abd989ba384fe0cc1a0fb795a83c033f0571ec5933ca4/frozenlist-1.8.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c7366fe1418a6133d5aa824ee53d406550110984de7637d65a178010f759c6ef", size = 236351, upload-time = "2025-10-06T05:35:33.834Z" },
+    { url = "https://files.pythonhosted.org/packages/97/92/24e97474b65c0262e9ecd076e826bfd1d3074adcc165a256e42e7b8a7249/frozenlist-1.8.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:13d23a45c4cebade99340c4165bd90eeb4a56c6d8a9d8aa49568cac19a6d0dc4", size = 218767, upload-time = "2025-10-06T05:35:35.205Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/bf/dc394a097508f15abff383c5108cb8ad880d1f64a725ed3b90d5c2fbf0bb/frozenlist-1.8.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:e4a3408834f65da56c83528fb52ce7911484f0d1eaf7b761fc66001db1646eff", size = 235887, upload-time = "2025-10-06T05:35:36.354Z" },
+    { url = "https://files.pythonhosted.org/packages/40/90/25b201b9c015dbc999a5baf475a257010471a1fa8c200c843fd4abbee725/frozenlist-1.8.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:42145cd2748ca39f32801dad54aeea10039da6f86e303659db90db1c4b614c8c", size = 228785, upload-time = "2025-10-06T05:35:37.949Z" },
+    { url = "https://files.pythonhosted.org/packages/84/f4/b5bc148df03082f05d2dd30c089e269acdbe251ac9a9cf4e727b2dbb8a3d/frozenlist-1.8.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e2de870d16a7a53901e41b64ffdf26f2fbb8917b3e6ebf398098d72c5b20bd7f", size = 230312, upload-time = "2025-10-06T05:35:39.178Z" },
+    { url = "https://files.pythonhosted.org/packages/db/4b/87e95b5d15097c302430e647136b7d7ab2398a702390cf4c8601975709e7/frozenlist-1.8.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:20e63c9493d33ee48536600d1a5c95eefc870cd71e7ab037763d1fbb89cc51e7", size = 217650, upload-time = "2025-10-06T05:35:40.377Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/70/78a0315d1fea97120591a83e0acd644da638c872f142fd72a6cebee825f3/frozenlist-1.8.0-cp310-cp310-win32.whl", hash = "sha256:adbeebaebae3526afc3c96fad434367cafbfd1b25d72369a9e5858453b1bb71a", size = 39659, upload-time = "2025-10-06T05:35:41.863Z" },
+    { url = "https://files.pythonhosted.org/packages/66/aa/3f04523fb189a00e147e60c5b2205126118f216b0aa908035c45336e27e4/frozenlist-1.8.0-cp310-cp310-win_amd64.whl", hash = "sha256:667c3777ca571e5dbeb76f331562ff98b957431df140b54c85fd4d52eea8d8f6", size = 43837, upload-time = "2025-10-06T05:35:43.205Z" },
+    { url = "https://files.pythonhosted.org/packages/39/75/1135feecdd7c336938bd55b4dc3b0dfc46d85b9be12ef2628574b28de776/frozenlist-1.8.0-cp310-cp310-win_arm64.whl", hash = "sha256:80f85f0a7cc86e7a54c46d99c9e1318ff01f4687c172ede30fd52d19d1da1c8e", size = 39989, upload-time = "2025-10-06T05:35:44.596Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/03/077f869d540370db12165c0aa51640a873fb661d8b315d1d4d67b284d7ac/frozenlist-1.8.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:09474e9831bc2b2199fad6da3c14c7b0fbdd377cce9d3d77131be28906cb7d84", size = 86912, upload-time = "2025-10-06T05:35:45.98Z" },
+    { url = "https://files.pythonhosted.org/packages/df/b5/7610b6bd13e4ae77b96ba85abea1c8cb249683217ef09ac9e0ae93f25a91/frozenlist-1.8.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:17c883ab0ab67200b5f964d2b9ed6b00971917d5d8a92df149dc2c9779208ee9", size = 50046, upload-time = "2025-10-06T05:35:47.009Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/ef/0e8f1fe32f8a53dd26bdd1f9347efe0778b0fddf62789ea683f4cc7d787d/frozenlist-1.8.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fa47e444b8ba08fffd1c18e8cdb9a75db1b6a27f17507522834ad13ed5922b93", size = 50119, upload-time = "2025-10-06T05:35:48.38Z" },
+    { url = "https://files.pythonhosted.org/packages/11/b1/71a477adc7c36e5fb628245dfbdea2166feae310757dea848d02bd0689fd/frozenlist-1.8.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2552f44204b744fba866e573be4c1f9048d6a324dfe14475103fd51613eb1d1f", size = 231067, upload-time = "2025-10-06T05:35:49.97Z" },
+    { url = "https://files.pythonhosted.org/packages/45/7e/afe40eca3a2dc19b9904c0f5d7edfe82b5304cb831391edec0ac04af94c2/frozenlist-1.8.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:957e7c38f250991e48a9a73e6423db1bb9dd14e722a10f6b8bb8e16a0f55f695", size = 233160, upload-time = "2025-10-06T05:35:51.729Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/aa/7416eac95603ce428679d273255ffc7c998d4132cfae200103f164b108aa/frozenlist-1.8.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:8585e3bb2cdea02fc88ffa245069c36555557ad3609e83be0ec71f54fd4abb52", size = 228544, upload-time = "2025-10-06T05:35:53.246Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/3d/2a2d1f683d55ac7e3875e4263d28410063e738384d3adc294f5ff3d7105e/frozenlist-1.8.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:edee74874ce20a373d62dc28b0b18b93f645633c2943fd90ee9d898550770581", size = 243797, upload-time = "2025-10-06T05:35:54.497Z" },
+    { url = "https://files.pythonhosted.org/packages/78/1e/2d5565b589e580c296d3bb54da08d206e797d941a83a6fdea42af23be79c/frozenlist-1.8.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c9a63152fe95756b85f31186bddf42e4c02c6321207fd6601a1c89ebac4fe567", size = 247923, upload-time = "2025-10-06T05:35:55.861Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/c3/65872fcf1d326a7f101ad4d86285c403c87be7d832b7470b77f6d2ed5ddc/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b6db2185db9be0a04fecf2f241c70b63b1a242e2805be291855078f2b404dd6b", size = 230886, upload-time = "2025-10-06T05:35:57.399Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/76/ac9ced601d62f6956f03cc794f9e04c81719509f85255abf96e2510f4265/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:f4be2e3d8bc8aabd566f8d5b8ba7ecc09249d74ba3c9ed52e54dc23a293f0b92", size = 245731, upload-time = "2025-10-06T05:35:58.563Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/49/ecccb5f2598daf0b4a1415497eba4c33c1e8ce07495eb07d2860c731b8d5/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:c8d1634419f39ea6f5c427ea2f90ca85126b54b50837f31497f3bf38266e853d", size = 241544, upload-time = "2025-10-06T05:35:59.719Z" },
+    { url = "https://files.pythonhosted.org/packages/53/4b/ddf24113323c0bbcc54cb38c8b8916f1da7165e07b8e24a717b4a12cbf10/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:1a7fa382a4a223773ed64242dbe1c9c326ec09457e6b8428efb4118c685c3dfd", size = 241806, upload-time = "2025-10-06T05:36:00.959Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/fb/9b9a084d73c67175484ba2789a59f8eebebd0827d186a8102005ce41e1ba/frozenlist-1.8.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:11847b53d722050808926e785df837353bd4d75f1d494377e59b23594d834967", size = 229382, upload-time = "2025-10-06T05:36:02.22Z" },
+    { url = "https://files.pythonhosted.org/packages/95/a3/c8fb25aac55bf5e12dae5c5aa6a98f85d436c1dc658f21c3ac73f9fa95e5/frozenlist-1.8.0-cp311-cp311-win32.whl", hash = "sha256:27c6e8077956cf73eadd514be8fb04d77fc946a7fe9f7fe167648b0b9085cc25", size = 39647, upload-time = "2025-10-06T05:36:03.409Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/f5/603d0d6a02cfd4c8f2a095a54672b3cf967ad688a60fb9faf04fc4887f65/frozenlist-1.8.0-cp311-cp311-win_amd64.whl", hash = "sha256:ac913f8403b36a2c8610bbfd25b8013488533e71e62b4b4adce9c86c8cea905b", size = 44064, upload-time = "2025-10-06T05:36:04.368Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/16/c2c9ab44e181f043a86f9a8f84d5124b62dbcb3a02c0977ec72b9ac1d3e0/frozenlist-1.8.0-cp311-cp311-win_arm64.whl", hash = "sha256:d4d3214a0f8394edfa3e303136d0575eece0745ff2b47bd2cb2e66dd92d4351a", size = 39937, upload-time = "2025-10-06T05:36:05.669Z" },
     { url = "https://files.pythonhosted.org/packages/69/29/948b9aa87e75820a38650af445d2ef2b6b8a6fab1a23b6bb9e4ef0be2d59/frozenlist-1.8.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:78f7b9e5d6f2fdb88cdde9440dc147259b62b9d3b019924def9f6478be254ac1", size = 87782, upload-time = "2025-10-06T05:36:06.649Z" },
     { url = "https://files.pythonhosted.org/packages/64/80/4f6e318ee2a7c0750ed724fa33a4bdf1eacdc5a39a7a24e818a773cd91af/frozenlist-1.8.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:229bf37d2e4acdaf808fd3f06e854a4a7a3661e871b10dc1f8f1896a3b05f18b", size = 50594, upload-time = "2025-10-06T05:36:07.69Z" },
     { url = "https://files.pythonhosted.org/packages/2b/94/5c8a2b50a496b11dd519f4a24cb5496cf125681dd99e94c604ccdea9419a/frozenlist-1.8.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f833670942247a14eafbb675458b4e61c82e002a148f49e68257b79296e865c4", size = 50448, upload-time = "2025-10-06T05:36:08.78Z" },
@@ -1548,46 +1877,70 @@ name = "fsspec"
 version = "2026.2.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/51/7c/f60c259dcbf4f0c47cc4ddb8f7720d2dcdc8888c8e5ad84c73ea4531cc5b/fsspec-2026.2.0.tar.gz", hash = "sha256:6544e34b16869f5aacd5b90bdf1a71acb37792ea3ddf6125ee69a22a53fb8bff", size = 313441, upload-time = "2026-02-05T21:50:53.743Z" }
 wheels = [
@@ -1596,7 +1949,7 @@ wheels = [
 
 [package.optional-dependencies]
 http = [
-    { name = "aiohttp", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
+    { name = "aiohttp" },
 ]
 
 [[package]]
@@ -1604,22 +1957,13 @@ name = "fsspec"
 version = "2026.3.0"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'win32'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e1/cf/b50ddf667c15276a9ab15a70ef5f257564de271957933ffea49d2cdbcdfb/fsspec-2026.3.0.tar.gz", hash = "sha256:1ee6a0e28677557f8c2f994e3eea77db6392b4de9cd1f5d7a9e87a0ae9d01b41", size = 313547, upload-time = "2026-03-27T19:11:14.892Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
 ]
 
-[package.optional-dependencies]
-http = [
-    { name = "aiohttp", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-]
-
 [[package]]
 name = "gitdb"
 version = "4.0.12"
@@ -1646,7 +1990,7 @@ wheels = [
 
 [[package]]
 name = "google-api-core"
-version = "2.30.3"
+version = "2.30.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "google-auth" },
@@ -1655,22 +1999,22 @@ dependencies = [
     { name = "protobuf" },
     { name = "requests" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/16/ce/502a57fb0ec752026d24df1280b162294b22a0afb98a326084f9a979138b/google_api_core-2.30.3.tar.gz", hash = "sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b", size = 177001, upload-time = "2026-04-10T00:41:28.035Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/1a/2e/83ca41eb400eb228f9279ec14ed66f6475218b59af4c6daec2d5a509fe83/google_api_core-2.30.2.tar.gz", hash = "sha256:9a8113e1a88bdc09a7ff629707f2214d98d61c7f6ceb0ea38c42a095d02dc0f9", size = 176862, upload-time = "2026-04-02T21:23:44.876Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/03/15/e56f351cf6ef1cfea58e6ac226a7318ed1deb2218c4b3cc9bd9e4b786c5a/google_api_core-2.30.3-py3-none-any.whl", hash = "sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8", size = 173274, upload-time = "2026-04-09T22:57:16.198Z" },
+    { url = "https://files.pythonhosted.org/packages/84/e1/ebd5100cbb202e561c0c8b59e485ef3bd63fa9beb610f3fdcaea443f0288/google_api_core-2.30.2-py3-none-any.whl", hash = "sha256:a4c226766d6af2580577db1f1a51bf53cd262f722b49731ce7414c43068a9594", size = 173236, upload-time = "2026-04-02T21:23:06.395Z" },
 ]
 
 [[package]]
 name = "google-auth"
-version = "2.49.2"
+version = "2.49.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cryptography" },
     { name = "pyasn1-modules" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/c6/fc/e925290a1ad95c975c459e2df070fac2b90954e13a0370ac505dff78cb99/google_auth-2.49.2.tar.gz", hash = "sha256:c1ae38500e73065dcae57355adb6278cf8b5c8e391994ae9cbadbcb9631ab409", size = 333958, upload-time = "2026-04-10T00:41:21.888Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/ea/80/6a696a07d3d3b0a92488933532f03dbefa4a24ab80fb231395b9a2a1be77/google_auth-2.49.1.tar.gz", hash = "sha256:16d40da1c3c5a0533f57d268fe72e0ebb0ae1cc3b567024122651c045d879b64", size = 333825, upload-time = "2026-03-12T19:30:58.135Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/73/76/d241a5c927433420507215df6cac1b1fa4ac0ba7a794df42a84326c68da8/google_auth-2.49.2-py3-none-any.whl", hash = "sha256:c2720924dfc82dedb962c9f52cabb2ab16714fd0a6a707e40561d217574ed6d5", size = 240638, upload-time = "2026-04-10T00:41:14.501Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/eb/c6c2478d8a8d633460be40e2a8a6f8f429171997a35a96f81d3b680dec83/google_auth-2.49.1-py3-none-any.whl", hash = "sha256:195ebe3dca18eddd1b3db5edc5189b76c13e96f29e73043b923ebcf3f1a860f7", size = 240737, upload-time = "2026-03-12T19:30:53.159Z" },
 ]
 
 [[package]]
@@ -1700,6 +2044,25 @@ version = "3.4.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/86/94/a5935717b307d7c71fe877b52b884c6af707d2d2090db118a03fbd799369/greenlet-3.4.0.tar.gz", hash = "sha256:f50a96b64dafd6169e595a5c56c9146ef80333e67d4476a65a9c55f400fc22ff", size = 195913, upload-time = "2026-04-08T17:08:00.863Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/0c/bc/e30e1e3d5e8860b0e0ce4d2b16b2681b77fd13542fc0d72f7e3c22d16eff/greenlet-3.4.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:d18eae9a7fb0f499efcd146b8c9750a2e1f6e0e93b5a382b3481875354a430e6", size = 284315, upload-time = "2026-04-08T17:02:52.322Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/cc/e023ae1967d2a26737387cac083e99e47f65f58868bd155c4c80c01ec4e0/greenlet-3.4.0-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:636d2f95c309e35f650e421c23297d5011716be15d966e6328b367c9fc513a82", size = 601916, upload-time = "2026-04-08T16:24:35.533Z" },
+    { url = "https://files.pythonhosted.org/packages/67/32/5be1677954b6d8810b33abe94e3eb88726311c58fa777dc97e390f7caf5a/greenlet-3.4.0-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:234582c20af9742583c3b2ddfbdbb58a756cfff803763ffaae1ac7990a9fac31", size = 616399, upload-time = "2026-04-08T16:30:54.536Z" },
+    { url = "https://files.pythonhosted.org/packages/82/0a/3a4af092b09ea02bcda30f33fd7db397619132fe52c6ece24b9363130d34/greenlet-3.4.0-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ac6a5f618be581e1e0713aecec8e54093c235e5fa17d6d8eb7ffc487e2300508", size = 621077, upload-time = "2026-04-08T16:40:34.946Z" },
+    { url = "https://files.pythonhosted.org/packages/74/bf/2d58d5ea515704f83e34699128c9072a34bea27d2b6a556e102105fe62a5/greenlet-3.4.0-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:523677e69cd4711b5a014e37bc1fb3a29947c3e3a5bb6a527e1cc50312e5a398", size = 611978, upload-time = "2026-04-08T15:56:31.335Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/39/3786520a7d5e33ee87b3da2531f589a3882abf686a42a3773183a41ef010/greenlet-3.4.0-cp310-cp310-manylinux_2_39_riscv64.whl", hash = "sha256:d336d46878e486de7d9458653c722875547ac8d36a1cff9ffaf4a74a3c1f62eb", size = 416893, upload-time = "2026-04-08T16:43:02.392Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/69/6525049b6c179d8a923256304d8387b8bdd4acab1acf0407852463c6d514/greenlet-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:b45e45fe47a19051a396abb22e19e7836a59ee6c5a90f3be427343c37908d65b", size = 1571957, upload-time = "2026-04-08T16:26:17.041Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/6c/bbfb798b05fec736a0d24dc23e81b45bcee87f45a83cfb39db031853bddc/greenlet-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5434271357be07f3ad0936c312645853b7e689e679e29310e2de09a9ea6c3adf", size = 1637223, upload-time = "2026-04-08T15:57:27.556Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/7d/981fe0e7c07bd9d5e7eb18decb8590a11e3955878291f7a7de2e9c668eb7/greenlet-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:a19093fbad824ed7c0f355b5ff4214bffda5f1a7f35f29b31fcaa240cc0135ab", size = 237902, upload-time = "2026-04-08T17:03:14.16Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/c6/dba32cab7e3a625b011aa5647486e2d28423a48845a2998c126dd69c85e1/greenlet-3.4.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:805bebb4945094acbab757d34d6e1098be6de8966009ab9ca54f06ff492def58", size = 285504, upload-time = "2026-04-08T15:52:14.071Z" },
+    { url = "https://files.pythonhosted.org/packages/54/f4/7cb5c2b1feb9a1f50e038be79980dfa969aa91979e5e3a18fdbcfad2c517/greenlet-3.4.0-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:439fc2f12b9b512d9dfa681c5afe5f6b3232c708d13e6f02c845e0d9f4c2d8c6", size = 605476, upload-time = "2026-04-08T16:24:37.064Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/af/b66ab0b2f9a4c5a867c136bf66d9599f34f21a1bcca26a2884a29c450bd9/greenlet-3.4.0-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a70ed1cb0295bee1df57b63bf7f46b4e56a5c93709eea769c1fec1bb23a95875", size = 618336, upload-time = "2026-04-08T16:30:56.59Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/31/56c43d2b5de476f77d36ceeec436328533bff960a4cba9a07616e93063ab/greenlet-3.4.0-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c5696c42e6bb5cfb7c6ff4453789081c66b9b91f061e5e9367fa15792644e76", size = 625045, upload-time = "2026-04-08T16:40:37.111Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/5c/8c5633ece6ba611d64bf2770219a98dd439921d6424e4e8cf16b0ac74ea5/greenlet-3.4.0-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c660bce1940a1acae5f51f0a064f1bc785d07ea16efcb4bc708090afc4d69e83", size = 613515, upload-time = "2026-04-08T15:56:32.478Z" },
+    { url = "https://files.pythonhosted.org/packages/80/ca/704d4e2c90acb8bdf7ae593f5cbc95f58e82de95cc540fb75631c1054533/greenlet-3.4.0-cp311-cp311-manylinux_2_39_riscv64.whl", hash = "sha256:89995ce5ddcd2896d89615116dd39b9703bfa0c07b583b85b89bf1b5d6eddf81", size = 419745, upload-time = "2026-04-08T16:43:04.022Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/df/950d15bca0d90a0e7395eb777903060504cdb509b7b705631e8fb69ff415/greenlet-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ee407d4d1ca9dc632265aee1c8732c4a2d60adff848057cdebfe5fe94eb2c8a2", size = 1574623, upload-time = "2026-04-08T16:26:18.596Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/e7/0839afab829fcb7333c9ff6d80c040949510055d2d4d63251f0d1c7c804e/greenlet-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:956215d5e355fffa7c021d168728321fd4d31fd730ac609b1653b450f6a4bc71", size = 1639579, upload-time = "2026-04-08T15:57:29.231Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/2b/b4482401e9bcaf9f5c97f67ead38db89c19520ff6d0d6699979c6efcc200/greenlet-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:5cb614ace7c27571270354e9c9f696554d073f8aa9319079dcba466bbdead711", size = 238233, upload-time = "2026-04-08T17:02:54.286Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/4d/d8123a4e0bcd583d5cfc8ddae0bbe29c67aab96711be331a7cc935a35966/greenlet-3.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:04403ac74fe295a361f650818de93be11b5038a78f49ccfb64d3b1be8fbf1267", size = 235045, upload-time = "2026-04-08T17:04:05.072Z" },
     { url = "https://files.pythonhosted.org/packages/65/8b/3669ad3b3f247a791b2b4aceb3aa5a31f5f6817bf547e4e1ff712338145a/greenlet-3.4.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:1a54a921561dd9518d31d2d3db4d7f80e589083063ab4d3e2e950756ef809e1a", size = 286902, upload-time = "2026-04-08T15:52:12.138Z" },
     { url = "https://files.pythonhosted.org/packages/38/3e/3c0e19b82900873e2d8469b590a6c4b3dfd2b316d0591f1c26b38a4879a5/greenlet-3.4.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16dec271460a9a2b154e3b1c2fa1050ce6280878430320e85e08c166772e3f97", size = 606099, upload-time = "2026-04-08T16:24:38.408Z" },
     { url = "https://files.pythonhosted.org/packages/b5/33/99fef65e7754fc76a4ed14794074c38c9ed3394a5bd129d7f61b705f3168/greenlet-3.4.0-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90036ce224ed6fe75508c1907a77e4540176dcf0744473627785dd519c6f9996", size = 618837, upload-time = "2026-04-08T16:30:58.298Z" },
@@ -1750,6 +2113,26 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905, upload-time = "2026-03-30T08:49:10.502Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/9d/cd/bb7b7e54084a344c03d68144450da7ddd5564e51a298ae1662de65f48e2d/grpcio-1.80.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:886457a7768e408cdce226ad1ca67d2958917d306523a0e21e1a2fdaa75c9c9c", size = 6050363, upload-time = "2026-03-30T08:46:20.894Z" },
+    { url = "https://files.pythonhosted.org/packages/16/02/1417f5c3460dea65f7a2e3c14e8b31e77f7ffb730e9bfadd89eda7a9f477/grpcio-1.80.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:7b641fc3f1dc647bfd80bd713addc68f6d145956f64677e56d9ebafc0bd72388", size = 12026037, upload-time = "2026-03-30T08:46:25.144Z" },
+    { url = "https://files.pythonhosted.org/packages/43/98/c910254eedf2cae368d78336a2de0678e66a7317d27c02522392f949b5c6/grpcio-1.80.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:33eb763f18f006dc7fee1e69831d38d23f5eccd15b2e0f92a13ee1d9242e5e02", size = 6602306, upload-time = "2026-03-30T08:46:27.593Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/f8/88ca4e78c077b2b2113d95da1e1ab43efd43d723c9a0397d26529c2c1a56/grpcio-1.80.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:52d143637e3872633fc7dd7c3c6a1c84e396b359f3a72e215f8bf69fd82084fc", size = 7301535, upload-time = "2026-03-30T08:46:29.556Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/96/f28660fe2fe0f153288bf4a04e4910b7309d442395135c88ed4f5b3b8b40/grpcio-1.80.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c51bf8ac4575af2e0678bccfb07e47321fc7acb5049b4482832c5c195e04e13a", size = 6808669, upload-time = "2026-03-30T08:46:31.984Z" },
+    { url = "https://files.pythonhosted.org/packages/47/eb/3f68a5e955779c00aeef23850e019c1c1d0e032d90633ba49c01ad5a96e0/grpcio-1.80.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:50a9871536d71c4fba24ee856abc03a87764570f0c457dd8db0b4018f379fed9", size = 7409489, upload-time = "2026-03-30T08:46:34.684Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/a7/d2f681a4bfb881be40659a309771f3bdfbfdb1190619442816c3f0ffc079/grpcio-1.80.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:a72d84ad0514db063e21887fbacd1fd7acb4d494a564cae22227cd45c7fbf199", size = 8423167, upload-time = "2026-03-30T08:46:36.833Z" },
+    { url = "https://files.pythonhosted.org/packages/97/8a/29b4589c204959aa35ce5708400a05bba72181807c45c47b3ec000c39333/grpcio-1.80.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f7691a6788ad9196872f95716df5bc643ebba13c97140b7a5ee5c8e75d1dea81", size = 7846761, upload-time = "2026-03-30T08:46:40.091Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/d2/ed143e097230ee121ac5848f6ff14372dba91289b10b536d54fb1b7cbae7/grpcio-1.80.0-cp310-cp310-win32.whl", hash = "sha256:46c2390b59d67f84e882694d489f5b45707c657832d7934859ceb8c33f467069", size = 4156534, upload-time = "2026-03-30T08:46:42.026Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/c9/df8279bb49b29409995e95efa85b72973d62f8aeff89abee58c91f393710/grpcio-1.80.0-cp310-cp310-win_amd64.whl", hash = "sha256:dc053420fc75749c961e2a4c906398d7c15725d36ccc04ae6d16093167223b58", size = 4889869, upload-time = "2026-03-30T08:46:44.219Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/db/1d56e5f5823257b291962d6c0ce106146c6447f405b60b234c4f222a7cde/grpcio-1.80.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:dfab85db094068ff42e2a3563f60ab3dddcc9d6488a35abf0132daec13209c8a", size = 6055009, upload-time = "2026-03-30T08:46:46.265Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/18/c83f3cad64c5ca63bca7e91e5e46b0d026afc5af9d0a9972472ceba294b3/grpcio-1.80.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:5c07e82e822e1161354e32da2662f741a4944ea955f9f580ec8fb409dd6f6060", size = 12035295, upload-time = "2026-03-30T08:46:49.099Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/8e/e14966b435be2dda99fbe89db9525ea436edc79780431a1c2875a3582644/grpcio-1.80.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:ba0915d51fd4ced2db5ff719f84e270afe0e2d4c45a7bdb1e8d036e4502928c2", size = 6610297, upload-time = "2026-03-30T08:46:52.123Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/26/d5eb38f42ce0e3fdc8174ea4d52036ef8d58cc4426cb800f2610f625dd75/grpcio-1.80.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:3cb8130ba457d2aa09fa6b7c3ed6b6e4e6a2685fce63cb803d479576c4d80e21", size = 7300208, upload-time = "2026-03-30T08:46:54.859Z" },
+    { url = "https://files.pythonhosted.org/packages/25/51/bd267c989f85a17a5b3eea65a6feb4ff672af41ca614e5a0279cc0ea381c/grpcio-1.80.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:09e5e478b3d14afd23f12e49e8b44c8684ac3c5f08561c43a5b9691c54d136ab", size = 6813442, upload-time = "2026-03-30T08:46:57.056Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/d9/d80eef735b19e9169e30164bbf889b46f9df9127598a83d174eb13a48b26/grpcio-1.80.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:00168469238b022500e486c1c33916acf2f2a9b2c022202cf8a1885d2e3073c1", size = 7414743, upload-time = "2026-03-30T08:46:59.682Z" },
+    { url = "https://files.pythonhosted.org/packages/de/f2/567f5bd5054398ed6b0509b9a30900376dcf2786bd936812098808b49d8d/grpcio-1.80.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8502122a3cc1714038e39a0b071acb1207ca7844208d5ea0d091317555ee7106", size = 8426046, upload-time = "2026-03-30T08:47:02.474Z" },
+    { url = "https://files.pythonhosted.org/packages/62/29/73ef0141b4732ff5eacd68430ff2512a65c004696997f70476a83e548e7e/grpcio-1.80.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ce1794f4ea6cc3ca29463f42d665c32ba1b964b48958a66497917fe9069f26e6", size = 7851641, upload-time = "2026-03-30T08:47:05.462Z" },
+    { url = "https://files.pythonhosted.org/packages/46/69/abbfa360eb229a8623bab5f5a4f8105e445bd38ce81a89514ba55d281ad0/grpcio-1.80.0-cp311-cp311-win32.whl", hash = "sha256:51b4a7189b0bef2aa30adce3c78f09c83526cf3dddb24c6a96555e3b97340440", size = 4154368, upload-time = "2026-03-30T08:47:08.027Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/d4/ae92206d01183b08613e846076115f5ac5991bae358d2a749fa864da5699/grpcio-1.80.0-cp311-cp311-win_amd64.whl", hash = "sha256:02e64bb0bb2da14d947a49e6f120a75e947250aebe65f9629b62bb1f5c14e6e9", size = 4894235, upload-time = "2026-03-30T08:47:10.839Z" },
     { url = "https://files.pythonhosted.org/packages/5c/e8/a2b749265eb3415abc94f2e619bbd9e9707bebdda787e61c593004ec927a/grpcio-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0", size = 6015616, upload-time = "2026-03-30T08:47:13.428Z" },
     { url = "https://files.pythonhosted.org/packages/3e/97/b1282161a15d699d1e90c360df18d19165a045ce1c343c7f313f5e8a0b77/grpcio-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2", size = 12014204, upload-time = "2026-03-30T08:47:15.873Z" },
     { url = "https://files.pythonhosted.org/packages/6e/5e/d319c6e997b50c155ac5a8cb12f5173d5b42677510e886d250d50264949d/grpcio-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de", size = 6563866, upload-time = "2026-03-30T08:47:18.588Z" },
@@ -1793,6 +2176,26 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/94/c8/1223f29c84a143ae9a56c084fc96894de0ba84b6e8d60a26241abd81d278/grpcio_tools-1.80.0.tar.gz", hash = "sha256:26052b19c6ce0dcf52d1024496aea3e2bdfa864159f06dc7b97b22d041a94b26", size = 6133212, upload-time = "2026-03-30T08:52:39.077Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/7a/54/1de67f5080da305a258758a8deb33f85666fa759f56785042a80b114a53f/grpcio_tools-1.80.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:727477b9afa4b53f5ec70cafb41c3965d893835e0d4ea9b542fe3d0d005602bf", size = 2549601, upload-time = "2026-03-30T08:50:09.498Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/b4/6d57ea199c5b880d182a2234aafa9a686f9c54c708ea7be75bd19d5aa825/grpcio_tools-1.80.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:85fe8d15f146c62cb76f38d963e256392d287442b9232717d30ae9e3bbda9bc3", size = 5712717, upload-time = "2026-03-30T08:50:15.028Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/1a/5505ee2277d368b409c796c78f22ea34a2a517b7d16755247efd663dc7af/grpcio_tools-1.80.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:95f0fffb5ca00519f3b602f938169b4dfa04b165e03258323965a9dfe8cc4d80", size = 2595941, upload-time = "2026-03-30T08:50:17.299Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/39/7fc1d16d8b767805079d76365d73e82c88dfaf179034473dbc9fbccedb77/grpcio_tools-1.80.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:7a0106af212748823a6ebd8ffbd9043414216f47cae3835f3187de0a62c415d3", size = 2909304, upload-time = "2026-03-30T08:50:19.485Z" },
+    { url = "https://files.pythonhosted.org/packages/97/d8/276ee759755d8f34f2ca5e9d2debd1a59f29f66059fb790bc369f2236c26/grpcio_tools-1.80.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:31fd01a4038b5dfc4ec79504a17061344f670f851833411717fef66920f13cd7", size = 2660269, upload-time = "2026-03-30T08:50:21.266Z" },
+    { url = "https://files.pythonhosted.org/packages/51/04/a6bb47942ad52901d777a649324d3203cf19d487f1d446263637f7a5bf12/grpcio_tools-1.80.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:57da9e19607fac4a01c48ead333c0dd15d91ed38794dce1194eda308f73e2038", size = 3109798, upload-time = "2026-03-30T08:50:23.267Z" },
+    { url = "https://files.pythonhosted.org/packages/be/50/7ee69b2919916739787d725f205b878e8d1619dd30422b8278e324664669/grpcio_tools-1.80.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:90968f751851abb8b145593609800fa70c837e1c93ba0792c480b1c8d8bc29ef", size = 3658930, upload-time = "2026-03-30T08:50:25.458Z" },
+    { url = "https://files.pythonhosted.org/packages/92/61/6d50783092b0e8bbcb04152d5388bf50ecf3ea2f783d95288ff6c3bb00fa/grpcio_tools-1.80.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b69dc5d6376ab43406304d1e2fc61ccf960b287d4325d77c3d45448c37a9d2da", size = 3326562, upload-time = "2026-03-30T08:50:27.809Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/58/d272ba549f6b1f0d8504f5fc4cd0a296f2c495a64d6e987fe871c4151557/grpcio_tools-1.80.0-cp310-cp310-win32.whl", hash = "sha256:3e8dcfebe34cb54df095de3d5871a4562a85a29f26d0f8bb41ee2c3dcfb11c3c", size = 997620, upload-time = "2026-03-30T08:50:29.959Z" },
+    { url = "https://files.pythonhosted.org/packages/70/5f/9f45a9946a0298711c72ca48b2c1f46a7d0c207a44cd3e4bb59d04556ba3/grpcio_tools-1.80.0-cp310-cp310-win_amd64.whl", hash = "sha256:fc622ed4ca400695f41c9eae3266276c6ba007e4c28164ce53b44e7ccc5e492b", size = 1162466, upload-time = "2026-03-30T08:50:32.242Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/d7/225dc91e6cb4f8d4830f16a478a468e9c6f342dcdf8cacc3772cc1d1f607/grpcio_tools-1.80.0-cp311-cp311-linux_armv7l.whl", hash = "sha256:1c43e5c768578fe0c6de3dbfaabe64af642951e1aa05c487cacedda63fa6c6c4", size = 2549937, upload-time = "2026-03-30T08:50:34.651Z" },
+    { url = "https://files.pythonhosted.org/packages/97/3d/a3684cb7677f3bea8db434eae02a9ce30135d7a268cd473b1bc8041c4722/grpcio_tools-1.80.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:a225348456575f3ac7851d8e23163195e76d2a905ee340cf73f33da62fba08aa", size = 5713099, upload-time = "2026-03-30T08:50:37.158Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/81/5665c697173ec346076358bfbfed0f7386825852494593ca14386478dfee/grpcio_tools-1.80.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a9396f02820d3f51c368c2c9dee15c55c77636c91be48a4d5c702e98d6fe0fdc", size = 2595776, upload-time = "2026-03-30T08:50:39.087Z" },
+    { url = "https://files.pythonhosted.org/packages/03/4f/fb81384f08a8226fa079972ba88272ac6277581fc72e8ab234d74c7e065b/grpcio_tools-1.80.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:797c08460cae16b402326eac329aec720dccf45c9f9279b95a352792eb53cf0f", size = 2909144, upload-time = "2026-03-30T08:50:40.922Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/9c/c957618f1c2a3195ecf5e83b03edcb364c2c1391f74183cb76e5763fa536/grpcio_tools-1.80.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1872a867eb6217de19edb70a4ce4a374ced9d94293533dfd42fa649713f55bf4", size = 2660477, upload-time = "2026-03-30T08:50:42.766Z" },
+    { url = "https://files.pythonhosted.org/packages/42/c7/23913da184febfd4eaf04de256a26bc5ff0411a5feb753e2adcff10fa86a/grpcio_tools-1.80.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:db122ba5ee357e3bb14e8944d69bbebcbdae91d5eace29ed4df3edc53cbc6528", size = 3110164, upload-time = "2026-03-30T08:50:44.761Z" },
+    { url = "https://files.pythonhosted.org/packages/af/fa/b25ed85ebdb0396910eaa250b1346d75527d22fca586265416bd4330dcd5/grpcio_tools-1.80.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ddefd48c227e6f4d640fe576fac5fb2c4a8898196f513604c8ec7671b3b3d421", size = 3658988, upload-time = "2026-03-30T08:50:47.546Z" },
+    { url = "https://files.pythonhosted.org/packages/60/85/2a55147cc9645e2ed777d1afcd2dc68cb34ba6f6c726bd4378ddb001a5ea/grpcio_tools-1.80.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:970ec058fa469dd6dae6ebc687501c5da670d95dead75f62f5b0933dce2c9794", size = 3326662, upload-time = "2026-03-30T08:50:49.59Z" },
+    { url = "https://files.pythonhosted.org/packages/68/ed/b05bee2a992e6f9bda81909692ea920d0896cfa05c5c9dd77ba03f2d22fb/grpcio_tools-1.80.0-cp311-cp311-win32.whl", hash = "sha256:526b4402d47a0e9b31cd6087e42b7674784617916cc73c764e0bc35ed41b4ee5", size = 997969, upload-time = "2026-03-30T08:50:51.539Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/9a/cb50c8270e2f6285ff2761130ae257ac4e51789ded4b9d9710ce0381814d/grpcio_tools-1.80.0-cp311-cp311-win_amd64.whl", hash = "sha256:ee101ecda7231770f6a5da1024a9a6ed587a7785f8fe23ab8283f4a1acb3ffe6", size = 1162742, upload-time = "2026-03-30T08:50:54.232Z" },
     { url = "https://files.pythonhosted.org/packages/0c/b9/65929df8c9614792db900a8e45d4997fadbd1734c827da3f0eb1f2fe4866/grpcio_tools-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:d19d5a8244311947b96f749c417b32d144641c6953f1164824579e1f0a51d040", size = 2550856, upload-time = "2026-03-30T08:50:57.3Z" },
     { url = "https://files.pythonhosted.org/packages/28/17/af1557544d68d1aeca9d9ea53ed16524022d521fec6ba334ab3530e9c1a6/grpcio_tools-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:fb599a3dc89ed1bb24489a2724b2f6dd4cddbbf0f7bdd69c073477bab0dc7554", size = 5710883, upload-time = "2026-03-30T08:51:00.077Z" },
     { url = "https://files.pythonhosted.org/packages/cc/48/aa9b4f7519ca972bc40d315d5c28f05ca28fa08de13d4e8b69f551b798ab/grpcio_tools-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:623ee31fc2ff7df9a987b4f3d139c30af17ce46a861ae0e25fb8c112daa32dd8", size = 2598004, upload-time = "2026-03-30T08:51:02.102Z" },
@@ -1852,10 +2255,10 @@ name = "hatchling"
 version = "1.29.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "packaging" },
     { name = "pathspec" },
     { name = "pluggy" },
+    { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "trove-classifiers" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/cf/9c/b4cfe330cd4f49cff17fd771154730555fa4123beb7f292cf0098b4e6c20/hatchling-1.29.0.tar.gz", hash = "sha256:793c31816d952cee405b83488ce001c719f325d9cda69f1fc4cd750527640ea6", size = 55656, upload-time = "2026-02-23T19:42:06.539Z" }
@@ -1966,11 +2369,10 @@ version = "0.36.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
-    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "fsspec", version = "2026.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'win32' or extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "fsspec", version = "2026.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "hf-xet", marker = "platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "packaging" },
     { name = "pyyaml" },
     { name = "requests" },
     { name = "tqdm" },
@@ -1986,9 +2388,13 @@ name = "hypercorn"
 version = "0.18.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
     { name = "h11" },
     { name = "h2" },
     { name = "priority" },
+    { name = "taskgroup", marker = "python_full_version < '3.11'" },
+    { name = "tomli", marker = "python_full_version < '3.11'" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
     { name = "wsproto" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/44/01/39f41a014b83dd5c795217362f2ca9071cf243e6a75bdcd6cd5b944658cc/hypercorn-0.18.0.tar.gz", hash = "sha256:d63267548939c46b0247dc8e5b45a9947590e35e64ee73a23c074aa3cf88e9da", size = 68420, upload-time = "2025-11-08T13:54:04.78Z" }
@@ -2098,74 +2504,99 @@ wheels = [
 
 [[package]]
 name = "jiter"
-version = "0.14.0"
+version = "0.13.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/6e/c1/0cddc6eb17d4c53a99840953f95dd3accdc5cfc7a337b0e9b26476276be9/jiter-0.14.0.tar.gz", hash = "sha256:e8a39e66dac7153cf3f964a12aad515afa8d74938ec5cc0018adcdae5367c79e", size = 165725, upload-time = "2026-04-10T14:28:42.01Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/5a/68/7390a418f10897da93b158f2d5a8bd0bcd73a0f9ec3bb36917085bb759ef/jiter-0.14.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:2fb2ce3a7bc331256dfb14cefc34832366bb28a9aca81deaf43bbf2a5659e607", size = 316295, upload-time = "2026-04-10T14:26:24.887Z" },
-    { url = "https://files.pythonhosted.org/packages/60/a0/5854ac00ff63551c52c6c89534ec6aba4b93474e7924d64e860b1c94165b/jiter-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5252a7ca23785cef5d02d4ece6077a1b556a410c591b379f82091c3001e14844", size = 315898, upload-time = "2026-04-10T14:26:26.601Z" },
-    { url = "https://files.pythonhosted.org/packages/41/a1/4f44832650a16b18e8391f1bf1d6ca4909bc738351826bcc198bba4357f4/jiter-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c409578cbd77c338975670ada777add4efd53379667edf0aceea730cabede6fb", size = 343730, upload-time = "2026-04-10T14:26:28.326Z" },
-    { url = "https://files.pythonhosted.org/packages/48/64/a329e9d469f86307203594b1707e11ae51c3348d03bfd514a5f997870012/jiter-0.14.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7ede4331a1899d604463369c730dbb961ffdc5312bc7f16c41c2896415b1304a", size = 370102, upload-time = "2026-04-10T14:26:30.089Z" },
-    { url = "https://files.pythonhosted.org/packages/94/c1/5e3dfc59635aa4d4c7bd20a820ac1d09b8ed851568356802cf1c08edb3cf/jiter-0.14.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:92cd8b6025981a041f5310430310b55b25ca593972c16407af8837d3d7d2ca01", size = 461335, upload-time = "2026-04-10T14:26:31.911Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/1b/dd157009dbc058f7b00108f545ccb72a2d56461395c4fc7b9cfdccb00af4/jiter-0.14.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:351bf6eda4e3a7ceb876377840c702e9a3e4ecc4624dbfb2d6463c67ae52637d", size = 378536, upload-time = "2026-04-10T14:26:33.595Z" },
-    { url = "https://files.pythonhosted.org/packages/91/78/256013667b7c10b8834f8e6e54cd3e562d4c6e34227a1596addccc05e38c/jiter-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1dcfbeb93d9ecd9ca128bbf8910120367777973fa193fb9a39c31237d8df165", size = 353859, upload-time = "2026-04-10T14:26:35.098Z" },
-    { url = "https://files.pythonhosted.org/packages/de/d9/137d65ade9093a409fe80955ce60b12bb753722c986467aeda47faf450ad/jiter-0.14.0-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:ae039aaef8de3f8157ecc1fdd4d85043ac4f57538c245a0afaecb8321ec951c3", size = 357626, upload-time = "2026-04-10T14:26:36.685Z" },
-    { url = "https://files.pythonhosted.org/packages/2e/48/76750835b87029342727c1a268bea8878ab988caf81ee4e7b880900eeb5a/jiter-0.14.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7d9d51eb96c82a9652933bd769fe6de66877d6eb2b2440e281f2938c51b5643e", size = 393172, upload-time = "2026-04-10T14:26:38.097Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/60/456c4e81d5c8045279aefe60e9e483be08793828800a4e64add8fdde7f2a/jiter-0.14.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d824ca4148b705970bf4e120924a212fdfca9859a73e42bd7889a63a4ea6bb98", size = 520300, upload-time = "2026-04-10T14:26:39.532Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/9f/2020e0984c235f678dced38fe4eec3058cf528e6af36ebf969b410305941/jiter-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:ff3a6465b3a0f54b1a430f45c3c0ba7d61ceb45cbc3e33f9e1a7f638d690baf3", size = 553059, upload-time = "2026-04-10T14:26:40.991Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/32/e2d298e1a22a4bbe6062136d1c7192db7dba003a6975e51d9a9eecabc4c2/jiter-0.14.0-cp312-cp312-win32.whl", hash = "sha256:5dec7c0a3e98d2a3f8a2e67382d0d7c3ac60c69103a4b271da889b4e8bb1e129", size = 206030, upload-time = "2026-04-10T14:26:42.517Z" },
-    { url = "https://files.pythonhosted.org/packages/36/ac/96369141b3d8a4a8e4590e983085efe1c436f35c0cda940dd76d942e3e40/jiter-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:fc7e37b4b8bc7e80a63ad6cfa5fc11fab27dbfea4cc4ae644b1ab3f273dc348f", size = 201603, upload-time = "2026-04-10T14:26:44.328Z" },
-    { url = "https://files.pythonhosted.org/packages/01/c3/75d847f264647017d7e3052bbcc8b1e24b95fa139c320c5f5066fa7a0bdd/jiter-0.14.0-cp312-cp312-win_arm64.whl", hash = "sha256:ee4a72f12847ef29b072aee9ad5474041ab2924106bdca9fcf5d7d965853e057", size = 191525, upload-time = "2026-04-10T14:26:46Z" },
-    { url = "https://files.pythonhosted.org/packages/97/2a/09f70020898507a89279659a1afe3364d57fc1b2c89949081975d135f6f5/jiter-0.14.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:af72f204cf4d44258e5b4c1745130ac45ddab0e71a06333b01de660ab4187a94", size = 315502, upload-time = "2026-04-10T14:26:47.697Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/be/080c96a45cd74f9fce5db4fd68510b88087fb37ffe2541ff73c12db92535/jiter-0.14.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4b77da71f6e819be5fbcec11a453fde5b1d0267ef6ed487e2a392fd8e14e4e3a", size = 314870, upload-time = "2026-04-10T14:26:49.149Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/5e/2d0fee155826a968a832cc32438de5e2a193292c8721ca70d0b53e58245b/jiter-0.14.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77f4ea612fe8b84b8b04e51d0e78029ecf3466348e25973f953de6e6a59aa4c1", size = 343406, upload-time = "2026-04-10T14:26:50.762Z" },
-    { url = "https://files.pythonhosted.org/packages/70/af/bf9ee0d3a4f8dc0d679fc1337f874fe60cdbf841ebbb304b374e1c9aaceb/jiter-0.14.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:62fe2451f8fcc0240261e6a4df18ecbcd58327857e61e625b2393ea3b468aac9", size = 369415, upload-time = "2026-04-10T14:26:52.188Z" },
-    { url = "https://files.pythonhosted.org/packages/0f/83/8e8561eadba31f4d3948a5b712fb0447ec71c3560b57a855449e7b8ddc98/jiter-0.14.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6112f26f5afc75bcb475787d29da3aa92f9d09c7858f632f4be6ffe607be82e9", size = 461456, upload-time = "2026-04-10T14:26:53.611Z" },
-    { url = "https://files.pythonhosted.org/packages/f6/c9/c5299e826a5fe6108d172b344033f61c69b1bb979dd8d9ddd4278a160971/jiter-0.14.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:215a6cb8fb7dc702aa35d475cc00ddc7f970e5c0b1417fb4b4ac5d82fa2a29db", size = 378488, upload-time = "2026-04-10T14:26:55.211Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/37/c16d9d15c0a471b8644b1abe3c82668092a707d9bedcf076f24ff2e380cd/jiter-0.14.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4ab96a30fb3cb2c7e0cd33f7616c8860da5f5674438988a54ac717caccdbaa", size = 353242, upload-time = "2026-04-10T14:26:56.705Z" },
-    { url = "https://files.pythonhosted.org/packages/58/ea/8050cb0dc654e728e1bfacbc0c640772f2181af5dedd13ae70145743a439/jiter-0.14.0-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:3a99c1387b1f2928f799a9de899193484d66206a50e98233b6b088a7f0c1edb2", size = 356823, upload-time = "2026-04-10T14:26:58.281Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/3b/cf71506d270e5f84d97326bf220e47aed9b95e9a4a060758fb07772170ab/jiter-0.14.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ab18d11074485438695f8d34a1b6da61db9754248f96d51341956607a8f39985", size = 392564, upload-time = "2026-04-10T14:27:00.018Z" },
-    { url = "https://files.pythonhosted.org/packages/b0/cc/8c6c74a3efb5bd671bfd14f51e8a73375464ca914b1551bc3b40e26ac2c9/jiter-0.14.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:801028dcfc26ac0895e4964cbc0fd62c73be9fd4a7d7b1aaf6e5790033a719b7", size = 520322, upload-time = "2026-04-10T14:27:01.664Z" },
-    { url = "https://files.pythonhosted.org/packages/41/24/68d7b883ec959884ddf00d019b2e0e82ba81b167e1253684fa90519ce33c/jiter-0.14.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ad425b087aafb4a1c7e1e98a279200743b9aaf30c3e0ba723aec93f061bd9bc8", size = 552619, upload-time = "2026-04-10T14:27:03.316Z" },
-    { url = "https://files.pythonhosted.org/packages/b6/89/b1a0985223bbf3150ff9e8f46f98fc9360c1de94f48abe271bbe1b465682/jiter-0.14.0-cp313-cp313-win32.whl", hash = "sha256:882bcb9b334318e233950b8be366fe5f92c86b66a7e449e76975dfd6d776a01f", size = 205699, upload-time = "2026-04-10T14:27:04.662Z" },
-    { url = "https://files.pythonhosted.org/packages/4c/19/3f339a5a7f14a11730e67f6be34f9d5105751d547b615ef593fa122a5ded/jiter-0.14.0-cp313-cp313-win_amd64.whl", hash = "sha256:9b8c571a5dba09b98bd3462b5a53f27209a5cbbe85670391692ede71974e979f", size = 201323, upload-time = "2026-04-10T14:27:06.139Z" },
-    { url = "https://files.pythonhosted.org/packages/50/56/752dd89c84be0e022a8ea3720bcfa0a8431db79a962578544812ce061739/jiter-0.14.0-cp313-cp313-win_arm64.whl", hash = "sha256:34f19dcc35cb1abe7c369b3756babf8c7f04595c0807a848df8f26ef8298ef92", size = 191099, upload-time = "2026-04-10T14:27:07.564Z" },
-    { url = "https://files.pythonhosted.org/packages/91/28/292916f354f25a1fe8cf2c918d1415c699a4a659ae00be0430e1c5d9ffea/jiter-0.14.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e89bcd7d426a75bb4952c696b267075790d854a07aad4c9894551a82c5b574ab", size = 320880, upload-time = "2026-04-10T14:27:09.326Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/c7/b002a7d8b8957ac3d469bd59c18ef4b1595a5216ae0de639a287b9816023/jiter-0.14.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7b25beaa0d4447ea8c7ae0c18c688905d34840d7d0b937f2f7bdd52162c98a40", size = 346563, upload-time = "2026-04-10T14:27:11.287Z" },
-    { url = "https://files.pythonhosted.org/packages/f9/3b/f8d07580d8706021d255a6356b8fab13ee4c869412995550ce6ed4ddf97d/jiter-0.14.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:651a8758dd413c51e3b7f6557cdc6921faf70b14106f45f969f091f5cda990ea", size = 357928, upload-time = "2026-04-10T14:27:12.729Z" },
-    { url = "https://files.pythonhosted.org/packages/47/5b/ac1a974da29e35507230383110ffec59998b290a8732585d04e19a9eb5ba/jiter-0.14.0-cp313-cp313t-win_amd64.whl", hash = "sha256:e1a7eead856a5038a8d291f1447176ab0b525c77a279a058121b5fccee257f6f", size = 203519, upload-time = "2026-04-10T14:27:14.125Z" },
-    { url = "https://files.pythonhosted.org/packages/96/6d/9fc8433d667d2454271378a79747d8c76c10b51b482b454e6190e511f244/jiter-0.14.0-cp313-cp313t-win_arm64.whl", hash = "sha256:2e692633a12cda97e352fdcd1c4acc971b1c28707e1e33aeef782b0cbf051975", size = 190113, upload-time = "2026-04-10T14:27:16.638Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/1e/354ed92461b165bd581f9ef5150971a572c873ec3b68a916d5aa91da3cc2/jiter-0.14.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:6f396837fc7577871ca8c12edaf239ed9ccef3bbe39904ae9b8b63ce0a48b140", size = 315277, upload-time = "2026-04-10T14:27:18.109Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/95/8c7c7028aa8636ac21b7a55faef3e34215e6ed0cbf5ae58258427f621aa3/jiter-0.14.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a4d50ea3d8ba4176f79754333bd35f1bbcd28e91adc13eb9b7ca91bc52a6cef9", size = 315923, upload-time = "2026-04-10T14:27:19.603Z" },
-    { url = "https://files.pythonhosted.org/packages/47/40/e2a852a44c4a089f2681a16611b7ce113224a80fd8504c46d78491b47220/jiter-0.14.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce17f8a050447d1b4153bda4fb7d26e6a9e74eb4f4a41913f30934c5075bf615", size = 344943, upload-time = "2026-04-10T14:27:21.262Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/1f/670f92adee1e9895eac41e8a4d623b6da68c4d46249d8b556b60b63f949e/jiter-0.14.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f4f1c4b125e1652aefbc2e2c1617b60a160ab789d180e3d423c41439e5f32850", size = 369725, upload-time = "2026-04-10T14:27:22.766Z" },
-    { url = "https://files.pythonhosted.org/packages/01/2f/541c9ba567d05de1c4874a0f8f8c5e3fd78e2b874266623da9a775cf46e0/jiter-0.14.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be808176a6a3a14321d18c603f2d40741858a7c4fc982f83232842689fe86dd9", size = 461210, upload-time = "2026-04-10T14:27:24.315Z" },
-    { url = "https://files.pythonhosted.org/packages/ce/a9/c31cbec09627e0d5de7aeaec7690dba03e090caa808fefd8133137cf45bc/jiter-0.14.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:26679d58ba816f88c3849306dd58cb863a90a1cf352cdd4ef67e30ccf8a77994", size = 380002, upload-time = "2026-04-10T14:27:26.155Z" },
-    { url = "https://files.pythonhosted.org/packages/50/02/3c05c1666c41904a2f607475a73e7a4763d1cbde2d18229c4f85b22dc253/jiter-0.14.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80381f5a19af8fa9aef743f080e34f6b25ebd89656475f8cf0470ec6157052aa", size = 354678, upload-time = "2026-04-10T14:27:27.701Z" },
-    { url = "https://files.pythonhosted.org/packages/7d/97/e15b33545c2b13518f560d695f974b9891b311641bdcf178d63177e8801e/jiter-0.14.0-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:004df5fdb8ecbd6d99f3227df18ba1a259254c4359736a2e6f036c944e02d7c5", size = 358920, upload-time = "2026-04-10T14:27:29.256Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/d2/8b1461def6b96ba44530df20d07ef7a1c7da22f3f9bf1727e2d611077bf1/jiter-0.14.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:cff5708f7ed0fa098f2b53446c6fa74c48469118e5cd7497b4f1cd569ab06928", size = 394512, upload-time = "2026-04-10T14:27:31.344Z" },
-    { url = "https://files.pythonhosted.org/packages/e3/88/837566dd6ed6e452e8d3205355afd484ce44b2533edfa4ed73a298ea893e/jiter-0.14.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:2492e5f06c36a976d25c7cc347a60e26d5470178d44cde1b9b75e60b4e519f28", size = 521120, upload-time = "2026-04-10T14:27:33.299Z" },
-    { url = "https://files.pythonhosted.org/packages/89/6b/b00b45c4d1b4c031777fe161d620b755b5b02cdade1e316dcb46e4471d63/jiter-0.14.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:7609cfbe3a03d37bfdbf5052012d5a879e72b83168a363deae7b3a26564d57de", size = 553668, upload-time = "2026-04-10T14:27:34.868Z" },
-    { url = "https://files.pythonhosted.org/packages/ad/d8/6fe5b42011d19397433d345716eac16728ac241862a2aac9c91923c7509a/jiter-0.14.0-cp314-cp314-win32.whl", hash = "sha256:7282342d32e357543565286b6450378c3cd402eea333fc1ebe146f1fabb306fc", size = 207001, upload-time = "2026-04-10T14:27:36.455Z" },
-    { url = "https://files.pythonhosted.org/packages/e5/43/5c2e08da1efad5e410f0eaaabeadd954812612c33fbbd8fd5328b489139d/jiter-0.14.0-cp314-cp314-win_amd64.whl", hash = "sha256:bd77945f38866a448e73b0b7637366afa814d4617790ecd88a18ca74377e6c02", size = 202187, upload-time = "2026-04-10T14:27:38Z" },
-    { url = "https://files.pythonhosted.org/packages/aa/1f/6e39ac0b4cdfa23e606af5b245df5f9adaa76f35e0c5096790da430ca506/jiter-0.14.0-cp314-cp314-win_arm64.whl", hash = "sha256:f2d4c61da0821ee42e0cdf5489da60a6d074306313a377c2b35af464955a3611", size = 192257, upload-time = "2026-04-10T14:27:39.504Z" },
-    { url = "https://files.pythonhosted.org/packages/05/57/7dbc0ffbbb5176a27e3518716608aa464aee2e2887dc938f0b900a120449/jiter-0.14.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:1bf7ff85517dd2f20a5750081d2b75083c1b269cf75afc7511bdf1f9548beb3b", size = 323441, upload-time = "2026-04-10T14:27:41.039Z" },
-    { url = "https://files.pythonhosted.org/packages/83/6e/7b3314398d8983f06b557aa21b670511ec72d3b79a68ee5e4d9bff972286/jiter-0.14.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c8ef8791c3e78d6c6b157c6d360fbb5c715bebb8113bc6a9303c5caff012754a", size = 348109, upload-time = "2026-04-10T14:27:42.552Z" },
-    { url = "https://files.pythonhosted.org/packages/ae/4f/8dc674bcd7db6dba566de73c08c763c337058baff1dbeb34567045b27cdc/jiter-0.14.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e74663b8b10da1fe0f4e4703fd7980d24ad17174b6bb35d8498d6e3ebce2ae6a", size = 368328, upload-time = "2026-04-10T14:27:44.574Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/5f/188e09a1f20906f98bbdec44ed820e19f4e8eb8aff88b9d1a5a497587ff3/jiter-0.14.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1aca29ba52913f78362ec9c2da62f22cdc4c3083313403f90c15460979b84d9b", size = 463301, upload-time = "2026-04-10T14:27:46.717Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/f0/19046ef965ed8f349e8554775bb12ff4352f443fbe12b95d31f575891256/jiter-0.14.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8b39b7d87a952b79949af5fef44d2544e58c21a28da7f1bae3ef166455c61746", size = 378891, upload-time = "2026-04-10T14:27:48.32Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/c3/da43bd8431ee175695777ee78cf0e93eacbb47393ff493f18c45231b427d/jiter-0.14.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:78d918a68b26e9fab068c2b5453577ef04943ab2807b9a6275df2a812599a310", size = 360749, upload-time = "2026-04-10T14:27:49.88Z" },
-    { url = "https://files.pythonhosted.org/packages/72/26/e054771be889707c6161dbdec9c23d33a9ec70945395d70f07cfea1e9a6f/jiter-0.14.0-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:b08997c35aee1201c1a5361466a8fb9162d03ae7bf6568df70b6c859f1e654a4", size = 358526, upload-time = "2026-04-10T14:27:51.504Z" },
-    { url = "https://files.pythonhosted.org/packages/c3/0f/7bea65ea2a6d91f2bf989ff11a18136644392bf2b0497a1fa50934c30a9c/jiter-0.14.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:260bf7ca20704d58d41f669e5e9fe7fe2fa72901a6b324e79056f5d52e9c9be2", size = 393926, upload-time = "2026-04-10T14:27:53.368Z" },
-    { url = "https://files.pythonhosted.org/packages/3c/a1/b1ff7d70deef61ac0b7c6c2f12d2ace950cdeecb4fdc94500a0926802857/jiter-0.14.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:37826e3df29e60f30a382f9294348d0238ef127f4b5d7f5f8da78b5b9e050560", size = 521052, upload-time = "2026-04-10T14:27:55.058Z" },
-    { url = "https://files.pythonhosted.org/packages/0b/7b/3b0649983cbaf15eda26a414b5b1982e910c67bd6f7b1b490f3cfc76896a/jiter-0.14.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:645be49c46f2900937ba0eaf871ad5183c96858c0af74b6becc7f4e367e36e06", size = 553716, upload-time = "2026-04-10T14:27:57.269Z" },
-    { url = "https://files.pythonhosted.org/packages/97/f8/33d78c83bd93ae0c0af05293a6660f88a1977caef39a6d72a84afab94ce0/jiter-0.14.0-cp314-cp314t-win32.whl", hash = "sha256:2f7877ed45118de283786178eceaf877110abacd04fde31efff3940ae9672674", size = 207957, upload-time = "2026-04-10T14:27:59.285Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/ac/2b760516c03e2227826d1f7025d89bf6bf6357a28fe75c2a2800873c50bf/jiter-0.14.0-cp314-cp314t-win_amd64.whl", hash = "sha256:14c0cb10337c49f5eafe8e7364daca5e29a020ea03580b8f8e6c597fed4e1588", size = 204690, upload-time = "2026-04-10T14:28:00.962Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/2e/a44c20c58aeed0355f2d326969a181696aeb551a25195f47563908a815be/jiter-0.14.0-cp314-cp314t-win_arm64.whl", hash = "sha256:5419d4aa2024961da9fe12a9cfe7484996735dca99e8e090b5c88595ef1951ff", size = 191338, upload-time = "2026-04-10T14:28:02.853Z" },
-    { url = "https://files.pythonhosted.org/packages/21/42/9042c3f3019de4adcb8c16591c325ec7255beea9fcd33a42a43f3b0b1000/jiter-0.14.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:fbd9e482663ca9d005d051330e4d2d8150bb208a209409c10f7e7dfdf7c49da9", size = 308810, upload-time = "2026-04-10T14:28:34.673Z" },
-    { url = "https://files.pythonhosted.org/packages/60/cf/a7e19b308bd86bb04776803b1f01a5f9a287a4c55205f4708827ee487fbf/jiter-0.14.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:33a20d838b91ef376b3a56896d5b04e725c7df5bc4864cc6569cf046a8d73b6d", size = 308443, upload-time = "2026-04-10T14:28:36.658Z" },
-    { url = "https://files.pythonhosted.org/packages/ca/44/e26ede3f0caeff93f222559cb0cc4ca68579f07d009d7b6010c5b586f9b1/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:432c4db5255d86a259efde91e55cb4c8d18c0521d844c9e2e7efcce3899fb016", size = 343039, upload-time = "2026-04-10T14:28:38.356Z" },
-    { url = "https://files.pythonhosted.org/packages/da/e9/1f9ada30cef7b05e74bb06f52127e7a724976c225f46adb65c37b1dadfb6/jiter-0.14.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:67f00d94b281174144d6532a04b66a12cb866cbdc47c3af3bfe2973677f9861a", size = 349613, upload-time = "2026-04-10T14:28:40.066Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/0d/5e/4ec91646aee381d01cdb9974e30882c9cd3b8c5d1079d6b5ff4af522439a/jiter-0.13.0.tar.gz", hash = "sha256:f2839f9c2c7e2dffc1bc5929a510e14ce0a946be9365fd1219e7ef342dae14f4", size = 164847, upload-time = "2026-02-02T12:37:56.441Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d0/5a/41da76c5ea07bec1b0472b6b2fdb1b651074d504b19374d7e130e0cdfb25/jiter-0.13.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:2ffc63785fd6c7977defe49b9824ae6ce2b2e2b77ce539bdaf006c26da06342e", size = 311164, upload-time = "2026-02-02T12:35:17.688Z" },
+    { url = "https://files.pythonhosted.org/packages/40/cb/4a1bf994a3e869f0d39d10e11efb471b76d0ad70ecbfb591427a46c880c2/jiter-0.13.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4a638816427006c1e3f0013eb66d391d7a3acda99a7b0cf091eff4497ccea33a", size = 320296, upload-time = "2026-02-02T12:35:19.828Z" },
+    { url = "https://files.pythonhosted.org/packages/09/82/acd71ca9b50ecebadc3979c541cd717cce2fe2bc86236f4fa597565d8f1a/jiter-0.13.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19928b5d1ce0ff8c1ee1b9bdef3b5bfc19e8304f1b904e436caf30bc15dc6cf5", size = 352742, upload-time = "2026-02-02T12:35:21.258Z" },
+    { url = "https://files.pythonhosted.org/packages/71/03/d1fc996f3aecfd42eb70922edecfb6dd26421c874503e241153ad41df94f/jiter-0.13.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:309549b778b949d731a2f0e1594a3f805716be704a73bf3ad9a807eed5eb5721", size = 363145, upload-time = "2026-02-02T12:35:24.653Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/61/a30492366378cc7a93088858f8991acd7d959759fe6138c12a4644e58e81/jiter-0.13.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bcdabaea26cb04e25df3103ce47f97466627999260290349a88c8136ecae0060", size = 487683, upload-time = "2026-02-02T12:35:26.162Z" },
+    { url = "https://files.pythonhosted.org/packages/20/4e/4223cffa9dbbbc96ed821c5aeb6bca510848c72c02086d1ed3f1da3d58a7/jiter-0.13.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a3a377af27b236abbf665a69b2bdd680e3b5a0bd2af825cd3b81245279a7606c", size = 373579, upload-time = "2026-02-02T12:35:27.582Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/c9/b0489a01329ab07a83812d9ebcffe7820a38163c6d9e7da644f926ff877c/jiter-0.13.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fe49d3ff6db74321f144dff9addd4a5874d3105ac5ba7c5b77fac099cfae31ae", size = 362904, upload-time = "2026-02-02T12:35:28.925Z" },
+    { url = "https://files.pythonhosted.org/packages/05/af/53e561352a44afcba9a9bc67ee1d320b05a370aed8df54eafe714c4e454d/jiter-0.13.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2113c17c9a67071b0f820733c0893ed1d467b5fcf4414068169e5c2cabddb1e2", size = 392380, upload-time = "2026-02-02T12:35:30.385Z" },
+    { url = "https://files.pythonhosted.org/packages/76/2a/dd805c3afb8ed5b326c5ae49e725d1b1255b9754b1b77dbecdc621b20773/jiter-0.13.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:ab1185ca5c8b9491b55ebf6c1e8866b8f68258612899693e24a92c5fdb9455d5", size = 517939, upload-time = "2026-02-02T12:35:31.865Z" },
+    { url = "https://files.pythonhosted.org/packages/20/2a/7b67d76f55b8fe14c937e7640389612f05f9a4145fc28ae128aaa5e62257/jiter-0.13.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:9621ca242547edc16400981ca3231e0c91c0c4c1ab8573a596cd9bb3575d5c2b", size = 551696, upload-time = "2026-02-02T12:35:33.306Z" },
+    { url = "https://files.pythonhosted.org/packages/85/9c/57cdd64dac8f4c6ab8f994fe0eb04dc9fd1db102856a4458fcf8a99dfa62/jiter-0.13.0-cp310-cp310-win32.whl", hash = "sha256:a7637d92b1c9d7a771e8c56f445c7f84396d48f2e756e5978840ecba2fac0894", size = 204592, upload-time = "2026-02-02T12:35:34.58Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/38/f4f3ea5788b8a5bae7510a678cdc747eda0c45ffe534f9878ff37e7cf3b3/jiter-0.13.0-cp310-cp310-win_amd64.whl", hash = "sha256:c1b609e5cbd2f52bb74fb721515745b407df26d7b800458bd97cb3b972c29e7d", size = 206016, upload-time = "2026-02-02T12:35:36.435Z" },
+    { url = "https://files.pythonhosted.org/packages/71/29/499f8c9eaa8a16751b1c0e45e6f5f1761d180da873d417996cc7bddc8eef/jiter-0.13.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ea026e70a9a28ebbdddcbcf0f1323128a8db66898a06eaad3a4e62d2f554d096", size = 311157, upload-time = "2026-02-02T12:35:37.758Z" },
+    { url = "https://files.pythonhosted.org/packages/50/f6/566364c777d2ab450b92100bea11333c64c38d32caf8dc378b48e5b20c46/jiter-0.13.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:66aa3e663840152d18cc8ff1e4faad3dd181373491b9cfdc6004b92198d67911", size = 319729, upload-time = "2026-02-02T12:35:39.246Z" },
+    { url = "https://files.pythonhosted.org/packages/73/dd/560f13ec5e4f116d8ad2658781646cca91b617ae3b8758d4a5076b278f70/jiter-0.13.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c3524798e70655ff19aec58c7d05adb1f074fecff62da857ea9be2b908b6d701", size = 354766, upload-time = "2026-02-02T12:35:40.662Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/0d/061faffcfe94608cbc28a0d42a77a74222bdf5055ccdbe5fd2292b94f510/jiter-0.13.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ec7e287d7fbd02cb6e22f9a00dd9c9cd504c40a61f2c61e7e1f9690a82726b4c", size = 362587, upload-time = "2026-02-02T12:35:42.025Z" },
+    { url = "https://files.pythonhosted.org/packages/92/c9/c66a7864982fd38a9773ec6e932e0398d1262677b8c60faecd02ffb67bf3/jiter-0.13.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:47455245307e4debf2ce6c6e65a717550a0244231240dcf3b8f7d64e4c2f22f4", size = 487537, upload-time = "2026-02-02T12:35:43.459Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/86/84eb4352cd3668f16d1a88929b5888a3fe0418ea8c1dfc2ad4e7bf6e069a/jiter-0.13.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ee9da221dca6e0429c2704c1b3655fe7b025204a71d4d9b73390c759d776d165", size = 373717, upload-time = "2026-02-02T12:35:44.928Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/09/9fe4c159358176f82d4390407a03f506a8659ed13ca3ac93a843402acecf/jiter-0.13.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:24ab43126d5e05f3d53a36a8e11eb2f23304c6c1117844aaaf9a0aa5e40b5018", size = 362683, upload-time = "2026-02-02T12:35:46.636Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/5e/85f3ab9caca0c1d0897937d378b4a515cae9e119730563572361ea0c48ae/jiter-0.13.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9da38b4fedde4fb528c740c2564628fbab737166a0e73d6d46cb4bb5463ff411", size = 392345, upload-time = "2026-02-02T12:35:48.088Z" },
+    { url = "https://files.pythonhosted.org/packages/12/4c/05b8629ad546191939e6f0c2f17e29f542a398f4a52fb987bc70b6d1eb8b/jiter-0.13.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0b34c519e17658ed88d5047999a93547f8889f3c1824120c26ad6be5f27b6cf5", size = 517775, upload-time = "2026-02-02T12:35:49.482Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/88/367ea2eb6bc582c7052e4baf5ddf57ebe5ab924a88e0e09830dfb585c02d/jiter-0.13.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d2a6394e6af690d462310a86b53c47ad75ac8c21dc79f120714ea449979cb1d3", size = 551325, upload-time = "2026-02-02T12:35:51.104Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/12/fa377ffb94a2f28c41afaed093e0d70cfe512035d5ecb0cad0ae4792d35e/jiter-0.13.0-cp311-cp311-win32.whl", hash = "sha256:0f0c065695f616a27c920a56ad0d4fc46415ef8b806bf8fc1cacf25002bd24e1", size = 204709, upload-time = "2026-02-02T12:35:52.467Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/16/8e8203ce92f844dfcd3d9d6a5a7322c77077248dbb12da52d23193a839cd/jiter-0.13.0-cp311-cp311-win_amd64.whl", hash = "sha256:0733312953b909688ae3c2d58d043aa040f9f1a6a75693defed7bc2cc4bf2654", size = 204560, upload-time = "2026-02-02T12:35:53.925Z" },
+    { url = "https://files.pythonhosted.org/packages/44/26/97cc40663deb17b9e13c3a5cf29251788c271b18ee4d262c8f94798b8336/jiter-0.13.0-cp311-cp311-win_arm64.whl", hash = "sha256:5d9b34ad56761b3bf0fbe8f7e55468704107608512350962d3317ffd7a4382d5", size = 189608, upload-time = "2026-02-02T12:35:55.304Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/30/7687e4f87086829955013ca12a9233523349767f69653ebc27036313def9/jiter-0.13.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:0a2bd69fc1d902e89925fc34d1da51b2128019423d7b339a45d9e99c894e0663", size = 307958, upload-time = "2026-02-02T12:35:57.165Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/27/e57f9a783246ed95481e6749cc5002a8a767a73177a83c63ea71f0528b90/jiter-0.13.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f917a04240ef31898182f76a332f508f2cc4b57d2b4d7ad2dbfebbfe167eb505", size = 318597, upload-time = "2026-02-02T12:35:58.591Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/52/e5719a60ac5d4d7c5995461a94ad5ef962a37c8bf5b088390e6fad59b2ff/jiter-0.13.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c1e2b199f446d3e82246b4fd9236d7cb502dc2222b18698ba0d986d2fecc6152", size = 348821, upload-time = "2026-02-02T12:36:00.093Z" },
+    { url = "https://files.pythonhosted.org/packages/61/db/c1efc32b8ba4c740ab3fc2d037d8753f67685f475e26b9d6536a4322bcdd/jiter-0.13.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:04670992b576fa65bd056dbac0c39fe8bd67681c380cb2b48efa885711d9d726", size = 364163, upload-time = "2026-02-02T12:36:01.937Z" },
+    { url = "https://files.pythonhosted.org/packages/55/8a/fb75556236047c8806995671a18e4a0ad646ed255276f51a20f32dceaeec/jiter-0.13.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5a1aff1fbdb803a376d4d22a8f63f8e7ccbce0b4890c26cc7af9e501ab339ef0", size = 483709, upload-time = "2026-02-02T12:36:03.41Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/16/43512e6ee863875693a8e6f6d532e19d650779d6ba9a81593ae40a9088ff/jiter-0.13.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b3fb8c2053acaef8580809ac1d1f7481a0a0bdc012fd7f5d8b18fb696a5a089", size = 370480, upload-time = "2026-02-02T12:36:04.791Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/4c/09b93e30e984a187bc8aaa3510e1ec8dcbdcd71ca05d2f56aac0492453aa/jiter-0.13.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bdaba7d87e66f26a2c45d8cbadcbfc4bf7884182317907baf39cfe9775bb4d93", size = 360735, upload-time = "2026-02-02T12:36:06.994Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/1b/46c5e349019874ec5dfa508c14c37e29864ea108d376ae26d90bee238cd7/jiter-0.13.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7b88d649135aca526da172e48083da915ec086b54e8e73a425ba50999468cc08", size = 391814, upload-time = "2026-02-02T12:36:08.368Z" },
+    { url = "https://files.pythonhosted.org/packages/15/9e/26184760e85baee7162ad37b7912797d2077718476bf91517641c92b3639/jiter-0.13.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e404ea551d35438013c64b4f357b0474c7abf9f781c06d44fcaf7a14c69ff9e2", size = 513990, upload-time = "2026-02-02T12:36:09.993Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/34/2c9355247d6debad57a0a15e76ab1566ab799388042743656e566b3b7de1/jiter-0.13.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:1f4748aad1b4a93c8bdd70f604d0f748cdc0e8744c5547798acfa52f10e79228", size = 548021, upload-time = "2026-02-02T12:36:11.376Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/4a/9f2c23255d04a834398b9c2e0e665382116911dc4d06b795710503cdad25/jiter-0.13.0-cp312-cp312-win32.whl", hash = "sha256:0bf670e3b1445fc4d31612199f1744f67f889ee1bbae703c4b54dc097e5dd394", size = 203024, upload-time = "2026-02-02T12:36:12.682Z" },
+    { url = "https://files.pythonhosted.org/packages/09/ee/f0ae675a957ae5a8f160be3e87acea6b11dc7b89f6b7ab057e77b2d2b13a/jiter-0.13.0-cp312-cp312-win_amd64.whl", hash = "sha256:15db60e121e11fe186c0b15236bd5d18381b9ddacdcf4e659feb96fc6c969c92", size = 205424, upload-time = "2026-02-02T12:36:13.93Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/02/ae611edf913d3cbf02c97cdb90374af2082c48d7190d74c1111dde08bcdd/jiter-0.13.0-cp312-cp312-win_arm64.whl", hash = "sha256:41f92313d17989102f3cb5dd533a02787cdb99454d494344b0361355da52fcb9", size = 186818, upload-time = "2026-02-02T12:36:15.308Z" },
+    { url = "https://files.pythonhosted.org/packages/91/9c/7ee5a6ff4b9991e1a45263bfc46731634c4a2bde27dfda6c8251df2d958c/jiter-0.13.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:1f8a55b848cbabf97d861495cd65f1e5c590246fabca8b48e1747c4dfc8f85bf", size = 306897, upload-time = "2026-02-02T12:36:16.748Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/02/be5b870d1d2be5dd6a91bdfb90f248fbb7dcbd21338f092c6b89817c3dbf/jiter-0.13.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f556aa591c00f2c45eb1b89f68f52441a016034d18b65da60e2d2875bbbf344a", size = 317507, upload-time = "2026-02-02T12:36:18.351Z" },
+    { url = "https://files.pythonhosted.org/packages/da/92/b25d2ec333615f5f284f3a4024f7ce68cfa0604c322c6808b2344c7f5d2b/jiter-0.13.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f7e1d61da332ec412350463891923f960c3073cf1aae93b538f0bb4c8cd46efb", size = 350560, upload-time = "2026-02-02T12:36:19.746Z" },
+    { url = "https://files.pythonhosted.org/packages/be/ec/74dcb99fef0aca9fbe56b303bf79f6bd839010cb18ad41000bf6cc71eec0/jiter-0.13.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3097d665a27bc96fd9bbf7f86178037db139f319f785e4757ce7ccbf390db6c2", size = 363232, upload-time = "2026-02-02T12:36:21.243Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/37/f17375e0bb2f6a812d4dd92d7616e41917f740f3e71343627da9db2824ce/jiter-0.13.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d01ecc3a8cbdb6f25a37bd500510550b64ddf9f7d64a107d92f3ccb25035d0f", size = 483727, upload-time = "2026-02-02T12:36:22.688Z" },
+    { url = "https://files.pythonhosted.org/packages/77/d2/a71160a5ae1a1e66c1395b37ef77da67513b0adba73b993a27fbe47eb048/jiter-0.13.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ed9bbc30f5d60a3bdf63ae76beb3f9db280d7f195dfcfa61af792d6ce912d159", size = 370799, upload-time = "2026-02-02T12:36:24.106Z" },
+    { url = "https://files.pythonhosted.org/packages/01/99/ed5e478ff0eb4e8aa5fd998f9d69603c9fd3f32de3bd16c2b1194f68361c/jiter-0.13.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98fbafb6e88256f4454de33c1f40203d09fc33ed19162a68b3b257b29ca7f663", size = 359120, upload-time = "2026-02-02T12:36:25.519Z" },
+    { url = "https://files.pythonhosted.org/packages/16/be/7ffd08203277a813f732ba897352797fa9493faf8dc7995b31f3d9cb9488/jiter-0.13.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5467696f6b827f1116556cb0db620440380434591e93ecee7fd14d1a491b6daa", size = 390664, upload-time = "2026-02-02T12:36:26.866Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/84/e0787856196d6d346264d6dcccb01f741e5f0bd014c1d9a2ebe149caf4f3/jiter-0.13.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:2d08c9475d48b92892583df9da592a0e2ac49bcd41fae1fec4f39ba6cf107820", size = 513543, upload-time = "2026-02-02T12:36:28.217Z" },
+    { url = "https://files.pythonhosted.org/packages/65/50/ecbd258181c4313cf79bca6c88fb63207d04d5bf5e4f65174114d072aa55/jiter-0.13.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:aed40e099404721d7fcaf5b89bd3b4568a4666358bcac7b6b15c09fb6252ab68", size = 547262, upload-time = "2026-02-02T12:36:29.678Z" },
+    { url = "https://files.pythonhosted.org/packages/27/da/68f38d12e7111d2016cd198161b36e1f042bd115c169255bcb7ec823a3bf/jiter-0.13.0-cp313-cp313-win32.whl", hash = "sha256:36ebfbcffafb146d0e6ffb3e74d51e03d9c35ce7c625c8066cdbfc7b953bdc72", size = 200630, upload-time = "2026-02-02T12:36:31.808Z" },
+    { url = "https://files.pythonhosted.org/packages/25/65/3bd1a972c9a08ecd22eb3b08a95d1941ebe6938aea620c246cf426ae09c2/jiter-0.13.0-cp313-cp313-win_amd64.whl", hash = "sha256:8d76029f077379374cf0dbc78dbe45b38dec4a2eb78b08b5194ce836b2517afc", size = 202602, upload-time = "2026-02-02T12:36:33.679Z" },
+    { url = "https://files.pythonhosted.org/packages/15/fe/13bd3678a311aa67686bb303654792c48206a112068f8b0b21426eb6851e/jiter-0.13.0-cp313-cp313-win_arm64.whl", hash = "sha256:bb7613e1a427cfcb6ea4544f9ac566b93d5bf67e0d48c787eca673ff9c9dff2b", size = 185939, upload-time = "2026-02-02T12:36:35.065Z" },
+    { url = "https://files.pythonhosted.org/packages/49/19/a929ec002ad3228bc97ca01dbb14f7632fffdc84a95ec92ceaf4145688ae/jiter-0.13.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:fa476ab5dd49f3bf3a168e05f89358c75a17608dbabb080ef65f96b27c19ab10", size = 316616, upload-time = "2026-02-02T12:36:36.579Z" },
+    { url = "https://files.pythonhosted.org/packages/52/56/d19a9a194afa37c1728831e5fb81b7722c3de18a3109e8f282bfc23e587a/jiter-0.13.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ade8cb6ff5632a62b7dbd4757d8c5573f7a2e9ae285d6b5b841707d8363205ef", size = 346850, upload-time = "2026-02-02T12:36:38.058Z" },
+    { url = "https://files.pythonhosted.org/packages/36/4a/94e831c6bf287754a8a019cb966ed39ff8be6ab78cadecf08df3bb02d505/jiter-0.13.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9950290340acc1adaded363edd94baebcee7dabdfa8bee4790794cd5cfad2af6", size = 358551, upload-time = "2026-02-02T12:36:39.417Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/ec/a4c72c822695fa80e55d2b4142b73f0012035d9fcf90eccc56bc060db37c/jiter-0.13.0-cp313-cp313t-win_amd64.whl", hash = "sha256:2b4972c6df33731aac0742b64fd0d18e0a69bc7d6e03108ce7d40c85fd9e3e6d", size = 201950, upload-time = "2026-02-02T12:36:40.791Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/00/393553ec27b824fbc29047e9c7cd4a3951d7fbe4a76743f17e44034fa4e4/jiter-0.13.0-cp313-cp313t-win_arm64.whl", hash = "sha256:701a1e77d1e593c1b435315ff625fd071f0998c5f02792038a5ca98899261b7d", size = 185852, upload-time = "2026-02-02T12:36:42.077Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/f5/f1997e987211f6f9bd71b8083047b316208b4aca0b529bb5f8c96c89ef3e/jiter-0.13.0-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:cc5223ab19fe25e2f0bf2643204ad7318896fe3729bf12fde41b77bfc4fafff0", size = 308804, upload-time = "2026-02-02T12:36:43.496Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/8f/5482a7677731fd44881f0204981ce2d7175db271f82cba2085dd2212e095/jiter-0.13.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:9776ebe51713acf438fd9b4405fcd86893ae5d03487546dae7f34993217f8a91", size = 318787, upload-time = "2026-02-02T12:36:45.071Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/b9/7257ac59778f1cd025b26a23c5520a36a424f7f1b068f2442a5b499b7464/jiter-0.13.0-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:879e768938e7b49b5e90b7e3fecc0dbec01b8cb89595861fb39a8967c5220d09", size = 353880, upload-time = "2026-02-02T12:36:47.365Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/87/719eec4a3f0841dad99e3d3604ee4cba36af4419a76f3cb0b8e2e691ad67/jiter-0.13.0-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:682161a67adea11e3aae9038c06c8b4a9a71023228767477d683f69903ebc607", size = 366702, upload-time = "2026-02-02T12:36:48.871Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/65/415f0a75cf6921e43365a1bc227c565cb949caca8b7532776e430cbaa530/jiter-0.13.0-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a13b68cd1cd8cc9de8f244ebae18ccb3e4067ad205220ef324c39181e23bbf66", size = 486319, upload-time = "2026-02-02T12:36:53.006Z" },
+    { url = "https://files.pythonhosted.org/packages/54/a2/9e12b48e82c6bbc6081fd81abf915e1443add1b13d8fc586e1d90bb02bb8/jiter-0.13.0-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87ce0f14c6c08892b610686ae8be350bf368467b6acd5085a5b65441e2bf36d2", size = 372289, upload-time = "2026-02-02T12:36:54.593Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/c1/e4693f107a1789a239c759a432e9afc592366f04e901470c2af89cfd28e1/jiter-0.13.0-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c365005b05505a90d1c47856420980d0237adf82f70c4aff7aebd3c1cc143ad", size = 360165, upload-time = "2026-02-02T12:36:56.112Z" },
+    { url = "https://files.pythonhosted.org/packages/17/08/91b9ea976c1c758240614bd88442681a87672eebc3d9a6dde476874e706b/jiter-0.13.0-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1317fdffd16f5873e46ce27d0e0f7f4f90f0cdf1d86bf6abeaea9f63ca2c401d", size = 389634, upload-time = "2026-02-02T12:36:57.495Z" },
+    { url = "https://files.pythonhosted.org/packages/18/23/58325ef99390d6d40427ed6005bf1ad54f2577866594bcf13ce55675f87d/jiter-0.13.0-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:c05b450d37ba0c9e21c77fef1f205f56bcee2330bddca68d344baebfc55ae0df", size = 514933, upload-time = "2026-02-02T12:36:58.909Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/25/69f1120c7c395fd276c3996bb8adefa9c6b84c12bb7111e5c6ccdcd8526d/jiter-0.13.0-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:775e10de3849d0631a97c603f996f518159272db00fdda0a780f81752255ee9d", size = 548842, upload-time = "2026-02-02T12:37:00.433Z" },
+    { url = "https://files.pythonhosted.org/packages/18/05/981c9669d86850c5fbb0d9e62bba144787f9fba84546ba43d624ee27ef29/jiter-0.13.0-cp314-cp314-win32.whl", hash = "sha256:632bf7c1d28421c00dd8bbb8a3bac5663e1f57d5cd5ed962bce3c73bf62608e6", size = 202108, upload-time = "2026-02-02T12:37:01.718Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/96/cdcf54dd0b0341db7d25413229888a346c7130bd20820530905fdb65727b/jiter-0.13.0-cp314-cp314-win_amd64.whl", hash = "sha256:f22ef501c3f87ede88f23f9b11e608581c14f04db59b6a801f354397ae13739f", size = 204027, upload-time = "2026-02-02T12:37:03.075Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/f9/724bcaaab7a3cd727031fe4f6995cb86c4bd344909177c186699c8dec51a/jiter-0.13.0-cp314-cp314-win_arm64.whl", hash = "sha256:07b75fe09a4ee8e0c606200622e571e44943f47254f95e2436c8bdcaceb36d7d", size = 187199, upload-time = "2026-02-02T12:37:04.414Z" },
+    { url = "https://files.pythonhosted.org/packages/62/92/1661d8b9fd6a3d7a2d89831db26fe3c1509a287d83ad7838831c7b7a5c7e/jiter-0.13.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:964538479359059a35fb400e769295d4b315ae61e4105396d355a12f7fef09f0", size = 318423, upload-time = "2026-02-02T12:37:05.806Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/3b/f77d342a54d4ebcd128e520fc58ec2f5b30a423b0fd26acdfc0c6fef8e26/jiter-0.13.0-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e104da1db1c0991b3eaed391ccd650ae8d947eab1480c733e5a3fb28d4313e40", size = 351438, upload-time = "2026-02-02T12:37:07.189Z" },
+    { url = "https://files.pythonhosted.org/packages/76/b3/ba9a69f0e4209bd3331470c723c2f5509e6f0482e416b612431a5061ed71/jiter-0.13.0-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0e3a5f0cde8ff433b8e88e41aa40131455420fb3649a3c7abdda6145f8cb7202", size = 364774, upload-time = "2026-02-02T12:37:08.579Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/16/6cdb31fa342932602458dbb631bfbd47f601e03d2e4950740e0b2100b570/jiter-0.13.0-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:57aab48f40be1db920a582b30b116fe2435d184f77f0e4226f546794cedd9cf0", size = 487238, upload-time = "2026-02-02T12:37:10.066Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/b1/956cc7abaca8d95c13aa8d6c9b3f3797241c246cd6e792934cc4c8b250d2/jiter-0.13.0-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7772115877c53f62beeb8fd853cab692dbc04374ef623b30f997959a4c0e7e95", size = 372892, upload-time = "2026-02-02T12:37:11.656Z" },
+    { url = "https://files.pythonhosted.org/packages/26/c4/97ecde8b1e74f67b8598c57c6fccf6df86ea7861ed29da84629cdbba76c4/jiter-0.13.0-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1211427574b17b633cfceba5040de8081e5abf114f7a7602f73d2e16f9fdaa59", size = 360309, upload-time = "2026-02-02T12:37:13.244Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/d7/eabe3cf46715854ccc80be2cd78dd4c36aedeb30751dbf85a1d08c14373c/jiter-0.13.0-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7beae3a3d3b5212d3a55d2961db3c292e02e302feb43fce6a3f7a31b90ea6dfe", size = 389607, upload-time = "2026-02-02T12:37:14.881Z" },
+    { url = "https://files.pythonhosted.org/packages/df/2d/03963fc0804e6109b82decfb9974eb92df3797fe7222428cae12f8ccaa0c/jiter-0.13.0-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:e5562a0f0e90a6223b704163ea28e831bd3a9faa3512a711f031611e6b06c939", size = 514986, upload-time = "2026-02-02T12:37:16.326Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/6c/8c83b45eb3eb1c1e18d841fe30b4b5bc5619d781267ca9bc03e005d8fd0a/jiter-0.13.0-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:6c26a424569a59140fb51160a56df13f438a2b0967365e987889186d5fc2f6f9", size = 548756, upload-time = "2026-02-02T12:37:17.736Z" },
+    { url = "https://files.pythonhosted.org/packages/47/66/eea81dfff765ed66c68fd2ed8c96245109e13c896c2a5015c7839c92367e/jiter-0.13.0-cp314-cp314t-win32.whl", hash = "sha256:24dc96eca9f84da4131cdf87a95e6ce36765c3b156fc9ae33280873b1c32d5f6", size = 201196, upload-time = "2026-02-02T12:37:19.101Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/32/4ac9c7a76402f8f00d00842a7f6b83b284d0cf7c1e9d4227bc95aa6d17fa/jiter-0.13.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0a8d76c7524087272c8ae913f5d9d608bd839154b62c4322ef65723d2e5bb0b8", size = 204215, upload-time = "2026-02-02T12:37:20.495Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/8e/7def204fea9f9be8b3c21a6f2dd6c020cf56c7d5ff753e0e23ed7f9ea57e/jiter-0.13.0-cp314-cp314t-win_arm64.whl", hash = "sha256:2c26cf47e2cad140fa23b6d58d435a7c0161f5c514284802f25e87fddfe11024", size = 187152, upload-time = "2026-02-02T12:37:22.124Z" },
+    { url = "https://files.pythonhosted.org/packages/79/b3/3c29819a27178d0e461a8571fb63c6ae38be6dc36b78b3ec2876bbd6a910/jiter-0.13.0-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b1cbfa133241d0e6bdab48dcdc2604e8ba81512f6bbd68ec3e8e1357dd3c316c", size = 307016, upload-time = "2026-02-02T12:37:42.755Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/ae/60993e4b07b1ac5ebe46da7aa99fdbb802eb986c38d26e3883ac0125c4e0/jiter-0.13.0-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:db367d8be9fad6e8ebbac4a7578b7af562e506211036cba2c06c3b998603c3d2", size = 305024, upload-time = "2026-02-02T12:37:44.774Z" },
+    { url = "https://files.pythonhosted.org/packages/77/fa/2227e590e9cf98803db2811f172b2d6460a21539ab73006f251c66f44b14/jiter-0.13.0-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45f6f8efb2f3b0603092401dc2df79fa89ccbc027aaba4174d2d4133ed661434", size = 339337, upload-time = "2026-02-02T12:37:46.668Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/92/015173281f7eb96c0ef580c997da8ef50870d4f7f4c9e03c845a1d62ae04/jiter-0.13.0-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:597245258e6ad085d064780abfb23a284d418d3e61c57362d9449c6c7317ee2d", size = 346395, upload-time = "2026-02-02T12:37:48.09Z" },
+    { url = "https://files.pythonhosted.org/packages/80/60/e50fa45dd7e2eae049f0ce964663849e897300433921198aef94b6ffa23a/jiter-0.13.0-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:3d744a6061afba08dd7ae375dcde870cffb14429b7477e10f67e9e6d68772a0a", size = 305169, upload-time = "2026-02-02T12:37:50.376Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/73/a009f41c5eed71c49bec53036c4b33555afcdee70682a18c6f66e396c039/jiter-0.13.0-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:ff732bd0a0e778f43d5009840f20b935e79087b4dc65bd36f1cd0f9b04b8ff7f", size = 303808, upload-time = "2026-02-02T12:37:52.092Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/10/528b439290763bff3d939268085d03382471b442f212dca4ff5f12802d43/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ab44b178f7981fcaea7e0a5df20e773c663d06ffda0198f1a524e91b2fde7e59", size = 337384, upload-time = "2026-02-02T12:37:53.582Z" },
+    { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" },
 ]
 
 [[package]]
@@ -2248,6 +2679,7 @@ name = "langchain"
 version = "0.3.28"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "async-timeout", version = "4.0.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
     { name = "langchain-core" },
     { name = "langchain-text-splitters" },
     { name = "langsmith" },
@@ -2263,21 +2695,20 @@ wheels = [
 
 [[package]]
 name = "langchain-core"
-version = "0.3.84"
+version = "0.3.76"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "jsonpatch" },
     { name = "langsmith" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" } },
+    { name = "packaging" },
     { name = "pydantic" },
     { name = "pyyaml" },
     { name = "tenacity" },
     { name = "typing-extensions" },
-    { name = "uuid-utils" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/13/3e/1e70598fac522eaeeeb22f03107da06495160533b25ba4388be9cef01d55/langchain_core-0.3.84.tar.gz", hash = "sha256:814b75bfe67a8460a53f5839bae9505bbfffc7af6f1aa0a5155715563f5cc490", size = 599092, upload-time = "2026-04-08T19:14:00.106Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/4f/4d/5e2ea7754ee0a1f524c412801c6ba9ad49318ecb58b0d524903c3d9efe0a/langchain_core-0.3.76.tar.gz", hash = "sha256:71136a122dd1abae2c289c5809d035cf12b5f2bb682d8a4c1078cd94feae7419", size = 573568, upload-time = "2025-09-10T14:49:39.863Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/8d/5b/ba75d5b80bd1f60ae799c8cbda5477eb7489fb21d40c967ec509bbd51933/langchain_core-0.3.84-py3-none-any.whl", hash = "sha256:d0b3a7b6473e30a2b3d4588ee09dc6471b8d38c46cd48f3e7c3d1ab6547f63cb", size = 459123, upload-time = "2026-04-08T19:13:57.818Z" },
+    { url = "https://files.pythonhosted.org/packages/77/b5/501c0ffcb09c734457ceaa86bc7b1dd37b6a261147bd653add03b838aacb/langchain_core-0.3.76-py3-none-any.whl", hash = "sha256:46e0eb48c7ac532432d51f8ca1ece1804c82afe9ae3dcf027b867edadf82b3ec", size = 447508, upload-time = "2025-09-10T14:49:38.179Z" },
 ]
 
 [[package]]
@@ -2294,20 +2725,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/de/46/d7529004de384b2abc9e5b76cf4a84a23f3028ec6381bd5f7c00ac39bfab/langchain_nvidia_ai_endpoints-0.3.19-py3-none-any.whl", hash = "sha256:40161a71646fcbe457ac5f2222c5eadcbe31a7d79d618f5a0857c37fffa3a6d5", size = 46229, upload-time = "2025-10-31T00:17:18.306Z" },
 ]
 
-[[package]]
-name = "langchain-openai"
-version = "0.3.35"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "langchain-core" },
-    { name = "openai" },
-    { name = "tiktoken" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/fb/96/06d0d25a37e05a0ff2d918f0a4b0bf0732aed6a43b472b0b68426ce04ef8/langchain_openai-0.3.35.tar.gz", hash = "sha256:fa985fd041c3809da256a040c98e8a43e91c6d165b96dcfeb770d8bd457bf76f", size = 786635, upload-time = "2025-10-06T15:09:28.463Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d8/d5/c90c5478215c20ee71d8feaf676f7ffd78d0568f8c98bd83f81ce7562ed7/langchain_openai-0.3.35-py3-none-any.whl", hash = "sha256:76d5707e6e81fd461d33964ad618bd326cb661a1975cef7c1cb0703576bdada5", size = 75952, upload-time = "2025-10-06T15:09:27.137Z" },
-]
-
 [[package]]
 name = "langchain-text-splitters"
 version = "0.3.11"
@@ -2322,12 +2739,12 @@ wheels = [
 
 [[package]]
 name = "langsmith"
-version = "0.7.32"
+version = "0.7.31"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "httpx" },
     { name = "orjson", marker = "platform_python_implementation != 'PyPy'" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" } },
+    { name = "packaging" },
     { name = "pydantic" },
     { name = "requests" },
     { name = "requests-toolbelt" },
@@ -2335,9 +2752,9 @@ dependencies = [
     { name = "xxhash" },
     { name = "zstandard" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/2f/b4/a0b4a501bee6b8a741ce29f8c48155b132118483cddc6f9247735ddb38fa/langsmith-0.7.32.tar.gz", hash = "sha256:b59b8e106d0e4c4842e158229296086e2aa7c561e3f602acda73d3ad0062e915", size = 1184518, upload-time = "2026-04-15T23:42:41.885Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e6/11/696019490992db5c87774dc20515529ef42a01e1d770fb754ed6d9b12fb0/langsmith-0.7.31.tar.gz", hash = "sha256:331ee4f7c26bb5be4022b9859b7d7b122cbf8c9d01d9f530114c1914b0349ffb", size = 1178480, upload-time = "2026-04-14T17:55:41.242Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/62/bc/148f98ac7dad73ac5e1b1c985290079cfeeb9ba13d760a24f25002beb2c9/langsmith-0.7.32-py3-none-any.whl", hash = "sha256:e1fde928990c4c52f47dc5132708cec674355d9101723d564183e965f383bf5f", size = 378272, upload-time = "2026-04-15T23:42:39.905Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/a1/a013cf458c301cda86a213dd153ce0a01c93f1ab5833f951e6a44c9763ce/langsmith-0.7.31-py3-none-any.whl", hash = "sha256:0291d49203f6e80dda011af1afda61eb0595a4d697adb684590a8805e1d61fb6", size = 373276, upload-time = "2026-04-14T17:55:39.677Z" },
 ]
 
 [[package]]
@@ -2360,7 +2777,7 @@ wheels = [
 
 [[package]]
 name = "leptonai"
-version = "0.27.1"
+version = "0.27.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -2385,7 +2802,7 @@ dependencies = [
     { name = "uvicorn" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f9/b9/6879fc6bb9241c7a9160de6de04d749ed43e65d65c639c06a232baf3e46e/leptonai-0.27.1-py3-none-any.whl", hash = "sha256:34ccb3936ee8bff67c856886b231d48537d18fa98a23dc54a7f30315f1140d12", size = 2480277, upload-time = "2026-04-16T00:56:20.712Z" },
+    { url = "https://files.pythonhosted.org/packages/88/80/281af82242d9e20e9c0b19fb35c2a7a6df728b14f25483271d8169ef0a9a/leptonai-0.27.0-py3-none-any.whl", hash = "sha256:2a83d77a3bfcd86b877483ab503b4cde970b0a4d4143535510dac67d565fc1a4", size = 2476455, upload-time = "2026-01-17T03:31:56.977Z" },
 ]
 
 [[package]]
@@ -2398,6 +2815,22 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/de/cd/337df968b38d94c5aabd3e1b10630f047a2b345f6e1d4456bd9fe7417537/libcst-1.8.6.tar.gz", hash = "sha256:f729c37c9317126da9475bdd06a7208eb52fcbd180a6341648b45a56b4ba708b", size = 891354, upload-time = "2025-11-03T22:33:30.621Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/c4/52/97d5454dee9d014821fe0c88f3dc0e83131b97dd074a4d49537056a75475/libcst-1.8.6-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:a20c5182af04332cc94d8520792befda06d73daf2865e6dddc5161c72ea92cb9", size = 2211698, upload-time = "2025-11-03T22:31:50.117Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/a4/d1205985d378164687af3247a9c8f8bdb96278b0686ac98ab951bc6d336a/libcst-1.8.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:36473e47cb199b7e6531d653ee6ffed057de1d179301e6c67f651f3af0b499d6", size = 2093104, upload-time = "2025-11-03T22:31:52.189Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/de/1338da681b7625b51e584922576d54f1b8db8fc7ff4dc79121afc5d4d2cd/libcst-1.8.6-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:06fc56335a45d61b7c1b856bfab4587b84cfe31e9d6368f60bb3c9129d900f58", size = 2237419, upload-time = "2025-11-03T22:31:53.526Z" },
+    { url = "https://files.pythonhosted.org/packages/50/06/ee66f2d83b870534756e593d464d8b33b0914c224dff3a407e0f74dc04e0/libcst-1.8.6-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6b23d14a7fc0addd9795795763af26b185deb7c456b1e7cc4d5228e69dab5ce8", size = 2300820, upload-time = "2025-11-03T22:31:55.995Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/ca/959088729de8e0eac8dd516e4fb8623d8d92bad539060fa85c9e94d418a5/libcst-1.8.6-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:16cfe0cfca5fd840e1fb2c30afb628b023d3085b30c3484a79b61eae9d6fe7ba", size = 2301201, upload-time = "2025-11-03T22:31:57.347Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/4c/2a21a8c452436097dfe1da277f738c3517f3f728713f16d84b9a3d67ca8d/libcst-1.8.6-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:455f49a93aea4070132c30ebb6c07c2dea0ba6c1fde5ffde59fc45dbb9cfbe4b", size = 2408213, upload-time = "2025-11-03T22:31:59.221Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/26/8f7b671fad38a515bb20b038718fd2221ab658299119ac9bcec56c2ced27/libcst-1.8.6-cp310-cp310-win_amd64.whl", hash = "sha256:72cca15800ffc00ba25788e4626189fe0bc5fe2a0c1cb4294bce2e4df21cc073", size = 2119189, upload-time = "2025-11-03T22:32:00.696Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/bf/ffb23a48e27001165cc5c81c5d9b3d6583b21b7f5449109e03a0020b060c/libcst-1.8.6-cp310-cp310-win_arm64.whl", hash = "sha256:6cad63e3a26556b020b634d25a8703b605c0e0b491426b3e6b9e12ed20f09100", size = 2001736, upload-time = "2025-11-03T22:32:02.986Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/15/95c2ecadc0fb4af8a7057ac2012a4c0ad5921b9ef1ace6c20006b56d3b5f/libcst-1.8.6-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:3649a813660fbffd7bc24d3f810b1f75ac98bd40d9d6f56d1f0ee38579021073", size = 2211289, upload-time = "2025-11-03T22:32:04.673Z" },
+    { url = "https://files.pythonhosted.org/packages/80/c3/7e1107acd5ed15cf60cc07c7bb64498a33042dc4821874aea3ec4942f3cd/libcst-1.8.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:0cbe17067055829607c5ba4afa46bfa4d0dd554c0b5a583546e690b7367a29b6", size = 2092927, upload-time = "2025-11-03T22:32:06.209Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/ff/0d2be87f67e2841a4a37d35505e74b65991d30693295c46fc0380ace0454/libcst-1.8.6-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:59a7e388c57d21d63722018978a8ddba7b176e3a99bd34b9b84a576ed53f2978", size = 2237002, upload-time = "2025-11-03T22:32:07.559Z" },
+    { url = "https://files.pythonhosted.org/packages/69/99/8c4a1b35c7894ccd7d33eae01ac8967122f43da41325223181ca7e4738fe/libcst-1.8.6-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:b6c1248cc62952a3a005792b10cdef2a4e130847be9c74f33a7d617486f7e532", size = 2301048, upload-time = "2025-11-03T22:32:08.869Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/8b/d1aa811eacf936cccfb386ae0585aa530ea1221ccf528d67144e041f5915/libcst-1.8.6-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6421a930b028c5ef4a943b32a5a78b7f1bf15138214525a2088f11acbb7d3d64", size = 2300675, upload-time = "2025-11-03T22:32:10.579Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/6b/7b65cd41f25a10c1fef2389ddc5c2b2cc23dc4d648083fa3e1aa7e0eeac2/libcst-1.8.6-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6d8b67874f2188399a71a71731e1ba2d1a2c3173b7565d1cc7ffb32e8fbaba5b", size = 2407934, upload-time = "2025-11-03T22:32:11.856Z" },
+    { url = "https://files.pythonhosted.org/packages/c5/8b/401cfff374bb3b785adfad78f05225225767ee190997176b2a9da9ed9460/libcst-1.8.6-cp311-cp311-win_amd64.whl", hash = "sha256:b0d8c364c44ae343937f474b2e492c1040df96d94530377c2f9263fb77096e4f", size = 2119247, upload-time = "2025-11-03T22:32:13.279Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/17/085f59eaa044b6ff6bc42148a5449df2b7f0ba567307de7782fe85c39ee2/libcst-1.8.6-cp311-cp311-win_arm64.whl", hash = "sha256:5dcaaebc835dfe5755bc85f9b186fb7e2895dda78e805e577fef1011d51d5a5c", size = 2001774, upload-time = "2025-11-03T22:32:14.647Z" },
     { url = "https://files.pythonhosted.org/packages/0c/3c/93365c17da3d42b055a8edb0e1e99f1c60c776471db6c9b7f1ddf6a44b28/libcst-1.8.6-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0c13d5bd3d8414a129e9dccaf0e5785108a4441e9b266e1e5e9d1f82d1b943c9", size = 2206166, upload-time = "2025-11-03T22:32:16.012Z" },
     { url = "https://files.pythonhosted.org/packages/1d/cb/7530940e6ac50c6dd6022349721074e19309eb6aa296e942ede2213c1a19/libcst-1.8.6-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f1472eeafd67cdb22544e59cf3bfc25d23dc94058a68cf41f6654ff4fcb92e09", size = 2083726, upload-time = "2025-11-03T22:32:17.312Z" },
     { url = "https://files.pythonhosted.org/packages/1b/cf/7e5eaa8c8f2c54913160671575351d129170db757bb5e4b7faffed022271/libcst-1.8.6-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:089c58e75cb142ec33738a1a4ea7760a28b40c078ab2fd26b270dac7d2633a4d", size = 2235755, upload-time = "2025-11-03T22:32:18.859Z" },
@@ -2479,8 +2912,7 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "einops" },
     { name = "ninja" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "packaging" },
     { name = "setuptools" },
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "transformers" },
@@ -2497,12 +2929,93 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/de/1f/77fa3081e4f66ca3576c896ae5d31c3002ac6607f9747d2e3aa49227e464/markdown-3.10.2-py3-none-any.whl", hash = "sha256:e91464b71ae3ee7afd3017d9f358ef0baf158fd9a298db92f1d4761133824c36", size = 108180, upload-time = "2026-02-09T14:57:25.787Z" },
 ]
 
+[[package]]
+name = "markdown-it-py"
+version = "3.0.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+dependencies = [
+    { name = "mdurl", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/38/71/3b932df36c1a044d397a1f92d1cf91ee0a503d91e470cbd670aa66b07ed0/markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb", size = 74596, upload-time = "2023-06-03T06:41:14.443Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/42/d7/1ec15b46af6af88f19b8e5ffea08fa375d433c998b8a7639e76935c14f1f/markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1", size = 87528, upload-time = "2023-06-03T06:41:11.019Z" },
+]
+
 [[package]]
 name = "markdown-it-py"
 version = "4.0.0"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
 dependencies = [
-    { name = "mdurl" },
+    { name = "mdurl", marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070, upload-time = "2025-08-11T12:57:52.854Z" }
 wheels = [
@@ -2515,6 +3028,28 @@ version = "3.0.3"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313, upload-time = "2025-09-27T18:37:40.426Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/e8/4b/3541d44f3937ba468b75da9eebcae497dcf67adb65caa16760b0a6807ebb/markupsafe-3.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f981d352f04553a7171b8e44369f2af4055f888dfb147d55e42d29e29e74559", size = 11631, upload-time = "2025-09-27T18:36:05.558Z" },
+    { url = "https://files.pythonhosted.org/packages/98/1b/fbd8eed11021cabd9226c37342fa6ca4e8a98d8188a8d9b66740494960e4/markupsafe-3.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c1493fb6e50ab01d20a22826e57520f1284df32f2d8601fdd90b6304601419", size = 12057, upload-time = "2025-09-27T18:36:07.165Z" },
+    { url = "https://files.pythonhosted.org/packages/40/01/e560d658dc0bb8ab762670ece35281dec7b6c1b33f5fbc09ebb57a185519/markupsafe-3.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1ba88449deb3de88bd40044603fafffb7bc2b055d626a330323a9ed736661695", size = 22050, upload-time = "2025-09-27T18:36:08.005Z" },
+    { url = "https://files.pythonhosted.org/packages/af/cd/ce6e848bbf2c32314c9b237839119c5a564a59725b53157c856e90937b7a/markupsafe-3.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f42d0984e947b8adf7dd6dde396e720934d12c506ce84eea8476409563607591", size = 20681, upload-time = "2025-09-27T18:36:08.881Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/2a/b5c12c809f1c3045c4d580b035a743d12fcde53cf685dbc44660826308da/markupsafe-3.0.3-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c0c0b3ade1c0b13b936d7970b1d37a57acde9199dc2aecc4c336773e1d86049c", size = 20705, upload-time = "2025-09-27T18:36:10.131Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/e3/9427a68c82728d0a88c50f890d0fc072a1484de2f3ac1ad0bfc1a7214fd5/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:0303439a41979d9e74d18ff5e2dd8c43ed6c6001fd40e5bf2e43f7bd9bbc523f", size = 21524, upload-time = "2025-09-27T18:36:11.324Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/36/23578f29e9e582a4d0278e009b38081dbe363c5e7165113fad546918a232/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d2ee202e79d8ed691ceebae8e0486bd9a2cd4794cec4824e1c99b6f5009502f6", size = 20282, upload-time = "2025-09-27T18:36:12.573Z" },
+    { url = "https://files.pythonhosted.org/packages/56/21/dca11354e756ebd03e036bd8ad58d6d7168c80ce1fe5e75218e4945cbab7/markupsafe-3.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:177b5253b2834fe3678cb4a5f0059808258584c559193998be2601324fdeafb1", size = 20745, upload-time = "2025-09-27T18:36:13.504Z" },
+    { url = "https://files.pythonhosted.org/packages/87/99/faba9369a7ad6e4d10b6a5fbf71fa2a188fe4a593b15f0963b73859a1bbd/markupsafe-3.0.3-cp310-cp310-win32.whl", hash = "sha256:2a15a08b17dd94c53a1da0438822d70ebcd13f8c3a95abe3a9ef9f11a94830aa", size = 14571, upload-time = "2025-09-27T18:36:14.779Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/25/55dc3ab959917602c96985cb1253efaa4ff42f71194bddeb61eb7278b8be/markupsafe-3.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:c4ffb7ebf07cfe8931028e3e4c85f0357459a3f9f9490886198848f4fa002ec8", size = 15056, upload-time = "2025-09-27T18:36:16.125Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/9e/0a02226640c255d1da0b8d12e24ac2aa6734da68bff14c05dd53b94a0fc3/markupsafe-3.0.3-cp310-cp310-win_arm64.whl", hash = "sha256:e2103a929dfa2fcaf9bb4e7c091983a49c9ac3b19c9061b6d5427dd7d14d81a1", size = 13932, upload-time = "2025-09-27T18:36:17.311Z" },
+    { url = "https://files.pythonhosted.org/packages/08/db/fefacb2136439fc8dd20e797950e749aa1f4997ed584c62cfb8ef7c2be0e/markupsafe-3.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1cc7ea17a6824959616c525620e387f6dd30fec8cb44f649e31712db02123dad", size = 11631, upload-time = "2025-09-27T18:36:18.185Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/2e/5898933336b61975ce9dc04decbc0a7f2fee78c30353c5efba7f2d6ff27a/markupsafe-3.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd4cd07944443f5a265608cc6aab442e4f74dff8088b0dfc8238647b8f6ae9a", size = 12058, upload-time = "2025-09-27T18:36:19.444Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/09/adf2df3699d87d1d8184038df46a9c80d78c0148492323f4693df54e17bb/markupsafe-3.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6b5420a1d9450023228968e7e6a9ce57f65d148ab56d2313fcd589eee96a7a50", size = 24287, upload-time = "2025-09-27T18:36:20.768Z" },
+    { url = "https://files.pythonhosted.org/packages/30/ac/0273f6fcb5f42e314c6d8cd99effae6a5354604d461b8d392b5ec9530a54/markupsafe-3.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0bf2a864d67e76e5c9a34dc26ec616a66b9888e25e7b9460e1c76d3293bd9dbf", size = 22940, upload-time = "2025-09-27T18:36:22.249Z" },
+    { url = "https://files.pythonhosted.org/packages/19/ae/31c1be199ef767124c042c6c3e904da327a2f7f0cd63a0337e1eca2967a8/markupsafe-3.0.3-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:bc51efed119bc9cfdf792cdeaa4d67e8f6fcccab66ed4bfdd6bde3e59bfcbb2f", size = 21887, upload-time = "2025-09-27T18:36:23.535Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/76/7edcab99d5349a4532a459e1fe64f0b0467a3365056ae550d3bcf3f79e1e/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:068f375c472b3e7acbe2d5318dea141359e6900156b5b2ba06a30b169086b91a", size = 23692, upload-time = "2025-09-27T18:36:24.823Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/28/6e74cdd26d7514849143d69f0bf2399f929c37dc2b31e6829fd2045b2765/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:7be7b61bb172e1ed687f1754f8e7484f1c8019780f6f6b0786e76bb01c2ae115", size = 21471, upload-time = "2025-09-27T18:36:25.95Z" },
+    { url = "https://files.pythonhosted.org/packages/62/7e/a145f36a5c2945673e590850a6f8014318d5577ed7e5920a4b3448e0865d/markupsafe-3.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9e130248f4462aaa8e2552d547f36ddadbeaa573879158d721bbd33dfe4743a", size = 22923, upload-time = "2025-09-27T18:36:27.109Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/62/d9c46a7f5c9adbeeeda52f5b8d802e1094e9717705a645efc71b0913a0a8/markupsafe-3.0.3-cp311-cp311-win32.whl", hash = "sha256:0db14f5dafddbb6d9208827849fad01f1a2609380add406671a26386cdf15a19", size = 14572, upload-time = "2025-09-27T18:36:28.045Z" },
+    { url = "https://files.pythonhosted.org/packages/83/8a/4414c03d3f891739326e1783338e48fb49781cc915b2e0ee052aa490d586/markupsafe-3.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:de8a88e63464af587c950061a5e6a67d3632e36df62b986892331d4620a35c01", size = 15077, upload-time = "2025-09-27T18:36:29.025Z" },
+    { url = "https://files.pythonhosted.org/packages/35/73/893072b42e6862f319b5207adc9ae06070f095b358655f077f69a35601f0/markupsafe-3.0.3-cp311-cp311-win_arm64.whl", hash = "sha256:3b562dd9e9ea93f13d53989d23a7e775fdfd1066c33494ff43f5418bc8c58a5c", size = 13876, upload-time = "2025-09-27T18:36:29.954Z" },
     { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615, upload-time = "2025-09-27T18:36:30.854Z" },
     { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020, upload-time = "2025-09-27T18:36:31.971Z" },
     { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332, upload-time = "2025-09-27T18:36:32.813Z" },
@@ -2611,7 +3146,8 @@ name = "mdit-py-plugins"
 version = "0.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markdown-it-py" },
+    { name = "markdown-it-py", version = "3.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "markdown-it-py", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b2/fd/a756d36c0bfba5f6e39a1cdbdbfdd448dc02692467d83816dff4592a1ebc/mdit_py_plugins-0.5.0.tar.gz", hash = "sha256:f4918cb50119f50446560513a8e311d574ff6aaed72606ddae6d35716fe809c6", size = 44655, upload-time = "2025-08-11T07:25:49.083Z" }
 wheels = [
@@ -2631,10 +3167,9 @@ wheels = [
 name = "megatron-core"
 source = { editable = "." }
 dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging" },
     { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 
@@ -2642,10 +3177,9 @@ dependencies = [
 dev = [
     { name = "av" },
     { name = "causal-conv1d" },
-    { name = "datasets", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "datasets", version = "4.8.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "datasets" },
     { name = "einops" },
-    { name = "emerging-optimizers" },
+    { name = "emerging-optimizers", marker = "python_full_version >= '3.12'" },
     { name = "fastapi" },
     { name = "flash-linear-attention" },
     { name = "flashinfer-python" },
@@ -2653,6 +3187,7 @@ dev = [
     { name = "mamba-ssm" },
     { name = "megatron-energon", extra = ["av-decode"], marker = "extra == 'extra-13-megatron-core-dev'" },
     { name = "multi-storage-client" },
+    { name = "nv-grouped-gemm" },
     { name = "nvidia-modelopt", marker = "(sys_platform != 'darwin' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-resiliency-ext" },
     { name = "nvtx" },
@@ -2660,10 +3195,9 @@ dev = [
     { name = "onnxscript", version = "0.6.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "openai", extra = ["aiohttp"], marker = "extra == 'extra-13-megatron-core-dev'" },
     { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" } },
-    { name = "orjson" },
     { name = "quart" },
-    { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "tensorstore", version = "0.1.82", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tensorstore", version = "0.1.82", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "tqdm" },
     { name = "transformer-engine", marker = "extra == 'extra-13-megatron-core-dev'" },
     { name = "wget" },
@@ -2671,19 +3205,20 @@ dev = [
 lts = [
     { name = "av" },
     { name = "causal-conv1d" },
-    { name = "datasets", version = "2.2.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "datasets", version = "4.8.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "datasets" },
     { name = "einops" },
-    { name = "emerging-optimizers" },
+    { name = "emerging-optimizers", marker = "python_full_version >= '3.12'" },
     { name = "fastapi" },
     { name = "flashinfer-python" },
     { name = "mamba-ssm" },
     { name = "megatron-energon", extra = ["av-decode"], marker = "extra == 'extra-13-megatron-core-lts'" },
     { name = "multi-storage-client" },
+    { name = "nv-grouped-gemm" },
     { name = "nvtx" },
     { name = "onnxscript", version = "0.6.2", source = { registry = "https://pypi.org/simple" } },
     { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" } },
-    { name = "tensorstore", version = "0.1.82", source = { registry = "https://pypi.org/simple" } },
+    { name = "tensorstore", version = "0.1.74", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tensorstore", version = "0.1.82", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "tqdm" },
     { name = "wget" },
 ]
@@ -2709,23 +3244,26 @@ build = [
     { name = "cython" },
     { name = "hatchling" },
     { name = "nvidia-mathdx" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "packaging" },
     { name = "pybind11" },
     { name = "setuptools" },
     { name = "torch", marker = "sys_platform == 'never' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 ci = [
-    { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "pandas", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "pandas", version = "2.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "pandas", version = "3.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "python-gitlab" },
     { name = "slack-sdk" },
 ]
 docs = [
-    { name = "myst-parser" },
+    { name = "myst-parser", version = "4.0.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "myst-parser", version = "5.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-sphinx-theme" },
-    { name = "sphinx" },
-    { name = "sphinx-autobuild" },
+    { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx-autobuild", version = "2024.10.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx-autobuild", version = "2025.8.25", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "sphinx-autodoc2" },
     { name = "sphinx-copybutton" },
 ]
@@ -2737,12 +3275,11 @@ linting = [
     { name = "ruff" },
 ]
 no-pypi-wheels = [
-    { name = "emerging-optimizers" },
-    { name = "flash-mla" },
+    { name = "emerging-optimizers", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "fast-hadamard-transform" },
 ]
 test = [
     { name = "coverage" },
-    { name = "mock" },
     { name = "nemo-run" },
     { name = "nltk" },
     { name = "pydantic" },
@@ -2769,8 +3306,8 @@ requires-dist = [
     { name = "datasets", marker = "extra == 'lts'" },
     { name = "einops", marker = "extra == 'dev'", specifier = "~=0.8" },
     { name = "einops", marker = "extra == 'lts'", specifier = "~=0.8" },
-    { name = "emerging-optimizers", marker = "extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.2.0" },
-    { name = "emerging-optimizers", marker = "extra == 'lts'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.2.0" },
+    { name = "emerging-optimizers", marker = "python_full_version >= '3.12' and extra == 'dev'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.2.0" },
+    { name = "emerging-optimizers", marker = "python_full_version >= '3.12' and extra == 'lts'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.2.0" },
     { name = "fastapi", marker = "extra == 'dev'", specifier = "~=0.50" },
     { name = "fastapi", marker = "extra == 'lts'", specifier = "~=0.50" },
     { name = "flash-linear-attention", marker = "extra == 'dev'", specifier = "~=0.4.0" },
@@ -2786,8 +3323,10 @@ requires-dist = [
     { name = "multi-storage-client", marker = "extra == 'dev'", specifier = "~=0.27" },
     { name = "multi-storage-client", marker = "extra == 'lts'", specifier = "~=0.27" },
     { name = "numpy" },
+    { name = "nv-grouped-gemm", marker = "extra == 'dev'", specifier = "~=1.1" },
+    { name = "nv-grouped-gemm", marker = "extra == 'lts'", specifier = "~=1.1" },
     { name = "nvidia-modelopt", extras = ["torch"], marker = "sys_platform != 'darwin' and extra == 'dev'" },
-    { name = "nvidia-resiliency-ext", marker = "extra == 'dev'", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=b2bb3d728a18795807d9f76c535e005a609a1b01" },
+    { name = "nvidia-resiliency-ext", marker = "extra == 'dev'", git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=15a851565a4ce846c04431ecb0cf09903ab4837e" },
     { name = "nvtx", marker = "extra == 'dev'", specifier = "~=0.2" },
     { name = "nvtx", marker = "extra == 'lts'", specifier = "~=0.2" },
     { name = "onnxscript", marker = "extra == 'dev'" },
@@ -2795,7 +3334,6 @@ requires-dist = [
     { name = "openai", extras = ["aiohttp"], marker = "extra == 'dev'" },
     { name = "opentelemetry-api", marker = "extra == 'dev'", specifier = "~=1.33.1" },
     { name = "opentelemetry-api", marker = "extra == 'lts'", specifier = "~=1.33.1" },
-    { name = "orjson", marker = "extra == 'dev'" },
     { name = "packaging", specifier = ">=24.2" },
     { name = "quart", marker = "extra == 'dev'" },
     { name = "sentencepiece", marker = "extra == 'mlm'" },
@@ -2848,12 +3386,11 @@ linting = [
     { name = "ruff", specifier = "~=0.9.0" },
 ]
 no-pypi-wheels = [
-    { name = "emerging-optimizers", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.2.0" },
-    { name = "flash-mla", git = "https://github.com/deepseek-ai/FlashMLA?rev=9edee0c022cd0938148a18e334203b0aab43aa19" },
+    { name = "emerging-optimizers", marker = "python_full_version >= '3.12'", git = "https://github.com/NVIDIA-NeMo/Emerging-Optimizers.git?rev=v0.2.0" },
+    { name = "fast-hadamard-transform", git = "https://github.com/Dao-AILab/fast-hadamard-transform.git?rev=f134af63deb2df17e1171a9ec1ea4a7d8604d5ca" },
 ]
 test = [
     { name = "coverage" },
-    { name = "mock" },
     { name = "nemo-run", git = "https://github.com/NVIDIA-NeMo/Run.git?rev=17ae86b64d7f75653351664f5d8c9e466faede00" },
     { name = "nltk" },
     { name = "pydantic" },
@@ -2876,12 +3413,11 @@ dependencies = [
     { name = "braceexpand" },
     { name = "click" },
     { name = "multi-storage-client" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pillow" },
     { name = "pyyaml" },
-    { name = "s3fs", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "s3fs", version = "2026.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev') or (python_full_version >= '3.14' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "s3fs" },
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "tqdm" },
     { name = "webdataset" },
@@ -2920,10 +3456,18 @@ resolution-markers = [
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
 ]
 dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fd/15/76f86faa0902836cc133939732f7611ace68cf54148487a99c539c272dc8/ml_dtypes-0.4.1.tar.gz", hash = "sha256:fad5f2de464fd09127e49b7fd1252b9006fb43d2edc1ff112d390c324af5ca7a", size = 692594, upload-time = "2024-09-13T19:07:11.624Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/56/9e/76b84f77c7afee3b116dc8407903a2d5004ba3059a8f3dcdcfa6ebf33fff/ml_dtypes-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1fe8b5b5e70cd67211db94b05cfd58dace592f24489b038dc6f9fe347d2e07d5", size = 397975, upload-time = "2024-09-13T19:06:44.265Z" },
+    { url = "https://files.pythonhosted.org/packages/03/7b/32650e1b2a2713a5923a0af2a8503d0d4a8fc99d1e1e0a1c40e996634460/ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8c09a6d11d8475c2a9fd2bc0695628aec105f97cab3b3a3fb7c9660348ff7d24", size = 2182570, upload-time = "2024-09-13T19:06:46.189Z" },
+    { url = "https://files.pythonhosted.org/packages/16/86/a9f7569e7e4f5395f927de38a13b92efa73f809285d04f2923b291783dd2/ml_dtypes-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f5e8f75fa371020dd30f9196e7d73babae2abd51cf59bdd56cb4f8de7e13354", size = 2160365, upload-time = "2024-09-13T19:06:48.198Z" },
+    { url = "https://files.pythonhosted.org/packages/04/1b/9a3afb437702503514f3934ec8d7904270edf013d28074f3e700e5dfbb0f/ml_dtypes-0.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:15fdd922fea57e493844e5abb930b9c0bd0af217d9edd3724479fc3d7ce70e3f", size = 126633, upload-time = "2024-09-13T19:06:50.656Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/76/9835c8609c29f2214359e88f29255fc4aad4ea0f613fb48aa8815ceda1b6/ml_dtypes-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2d55b588116a7085d6e074cf0cdb1d6fa3875c059dddc4d2c94a4cc81c23e975", size = 397973, upload-time = "2024-09-13T19:06:51.748Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/99/e68c56fac5de973007a10254b6e17a0362393724f40f66d5e4033f4962c2/ml_dtypes-0.4.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e138a9b7a48079c900ea969341a5754019a1ad17ae27ee330f7ebf43f23877f9", size = 2185134, upload-time = "2024-09-13T19:06:53.197Z" },
+    { url = "https://files.pythonhosted.org/packages/28/bc/6a2344338ea7b61cd7b46fb24ec459360a5a0903b57c55b156c1e46c644a/ml_dtypes-0.4.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:74c6cfb5cf78535b103fde9ea3ded8e9f16f75bc07789054edc7776abfb3d752", size = 2163661, upload-time = "2024-09-13T19:06:54.519Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/d3/ddfd9878b223b3aa9a930c6100a99afca5cfab7ea703662e00323acb7568/ml_dtypes-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:274cc7193dd73b35fb26bef6c5d40ae3eb258359ee71cd82f6e96a8c948bdaa6", size = 126727, upload-time = "2024-09-13T19:06:55.897Z" },
     { url = "https://files.pythonhosted.org/packages/ba/1a/99e924f12e4b62139fbac87419698c65f956d58de0dbfa7c028fa5b096aa/ml_dtypes-0.4.1-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:827d3ca2097085cf0355f8fdf092b888890bb1b1455f52801a2d7756f056f54b", size = 405077, upload-time = "2024-09-13T19:06:57.538Z" },
     { url = "https://files.pythonhosted.org/packages/8f/8c/7b610bd500617854c8cc6ed7c8cfb9d48d6a5c21a1437a36a4b9bc8a3598/ml_dtypes-0.4.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:772426b08a6172a891274d581ce58ea2789cc8abc1c002a27223f314aaf894e7", size = 2181554, upload-time = "2024-09-13T19:06:59.196Z" },
     { url = "https://files.pythonhosted.org/packages/c7/c6/f89620cecc0581dc1839e218c4315171312e46c62a62da6ace204bda91c0/ml_dtypes-0.4.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:126e7d679b8676d1a958f2651949fbfa182832c3cd08020d8facd94e4114f3e9", size = 2160488, upload-time = "2024-09-13T19:07:03.131Z" },
@@ -2943,23 +3487,40 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
 ]
 dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314, upload-time = "2025-11-17T22:32:31.031Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/fe/3a/c5b855752a70267ff729c349e650263adb3c206c29d28cc8ea7ace30a1d5/ml_dtypes-0.5.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b95e97e470fe60ed493fd9ae3911d8da4ebac16bd21f87ffa2b7c588bf22ea2c", size = 679735, upload-time = "2025-11-17T22:31:31.367Z" },
+    { url = "https://files.pythonhosted.org/packages/41/79/7433f30ee04bd4faa303844048f55e1eb939131c8e5195a00a96a0939b64/ml_dtypes-0.5.4-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b4b801ebe0b477be666696bda493a9be8356f1f0057a57f1e35cd26928823e5a", size = 5051883, upload-time = "2025-11-17T22:31:33.658Z" },
+    { url = "https://files.pythonhosted.org/packages/10/b1/8938e8830b0ee2e167fc75a094dea766a1152bde46752cd9bfc57ee78a82/ml_dtypes-0.5.4-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:388d399a2152dd79a3f0456a952284a99ee5c93d3e2f8dfe25977511e0515270", size = 5030369, upload-time = "2025-11-17T22:31:35.595Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/a3/51886727bd16e2f47587997b802dd56398692ce8c6c03c2e5bb32ecafe26/ml_dtypes-0.5.4-cp310-cp310-win_amd64.whl", hash = "sha256:4ff7f3e7ca2972e7de850e7b8fcbb355304271e2933dd90814c1cb847414d6e2", size = 210738, upload-time = "2025-11-17T22:31:37.43Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/5e/712092cfe7e5eb667b8ad9ca7c54442f21ed7ca8979745f1000e24cf8737/ml_dtypes-0.5.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6c7ecb74c4bd71db68a6bea1edf8da8c34f3d9fe218f038814fd1d310ac76c90", size = 679734, upload-time = "2025-11-17T22:31:39.223Z" },
+    { url = "https://files.pythonhosted.org/packages/4f/cf/912146dfd4b5c0eea956836c01dcd2fce6c9c844b2691f5152aca196ce4f/ml_dtypes-0.5.4-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc11d7e8c44a65115d05e2ab9989d1e045125d7be8e05a071a48bc76eb6d6040", size = 5056165, upload-time = "2025-11-17T22:31:41.071Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/80/19189ea605017473660e43762dc853d2797984b3c7bf30ce656099add30c/ml_dtypes-0.5.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:19b9a53598f21e453ea2fbda8aa783c20faff8e1eeb0d7ab899309a0053f1483", size = 5034975, upload-time = "2025-11-17T22:31:42.758Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/24/70bd59276883fdd91600ca20040b41efd4902a923283c4d6edcb1de128d2/ml_dtypes-0.5.4-cp311-cp311-win_amd64.whl", hash = "sha256:7c23c54a00ae43edf48d44066a7ec31e05fdc2eee0be2b8b50dd1903a1db94bb", size = 210742, upload-time = "2025-11-17T22:31:44.068Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/c9/64230ef14e40aa3f1cb254ef623bf812735e6bec7772848d19131111ac0d/ml_dtypes-0.5.4-cp311-cp311-win_arm64.whl", hash = "sha256:557a31a390b7e9439056644cb80ed0735a6e3e3bb09d67fd5687e4b04238d1de", size = 160709, upload-time = "2025-11-17T22:31:46.557Z" },
     { url = "https://files.pythonhosted.org/packages/a8/b8/3c70881695e056f8a32f8b941126cf78775d9a4d7feba8abcb52cb7b04f2/ml_dtypes-0.5.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a174837a64f5b16cab6f368171a1a03a27936b31699d167684073ff1c4237dac", size = 676927, upload-time = "2025-11-17T22:31:48.182Z" },
     { url = "https://files.pythonhosted.org/packages/54/0f/428ef6881782e5ebb7eca459689448c0394fa0a80bea3aa9262cba5445ea/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a7f7c643e8b1320fd958bf098aa7ecf70623a42ec5154e3be3be673f4c34d900", size = 5028464, upload-time = "2025-11-17T22:31:50.135Z" },
     { url = "https://files.pythonhosted.org/packages/3a/cb/28ce52eb94390dda42599c98ea0204d74799e4d8047a0eb559b6fd648056/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ad459e99793fa6e13bd5b7e6792c8f9190b4e5a1b45c63aba14a4d0a7f1d5ff", size = 5009002, upload-time = "2025-11-17T22:31:52.001Z" },
@@ -2987,15 +3548,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ad/3f/3d42e9a78fe5edf792a83c074b13b9b770092a4fbf3462872f4303135f09/ml_dtypes-0.5.4-cp314-cp314t-win_arm64.whl", hash = "sha256:11942cbf2cf92157db91e5022633c0d9474d4dfd813a909383bd23ce828a4b7d", size = 168825, upload-time = "2025-11-17T22:32:23.766Z" },
 ]
 
-[[package]]
-name = "mock"
-version = "5.2.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/07/8c/14c2ae915e5f9dca5a22edd68b35be94400719ccfa068a03e0fb63d0f6f6/mock-5.2.0.tar.gz", hash = "sha256:4e460e818629b4b173f32d08bf30d3af8123afbb8e04bb5707a1fd4799e503f0", size = 92796, upload-time = "2025-03-03T12:31:42.911Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/bd/d9/617e6af809bf3a1d468e0d58c3997b1dc219a9a9202e650d30c2fc85d481/mock-5.2.0-py3-none-any.whl", hash = "sha256:7ba87f72ca0e915175596069dbbcc7c75af7b5e9b9bc107ad6349ede0819982f", size = 31617, upload-time = "2025-03-03T12:31:41.518Z" },
-]
-
 [[package]]
 name = "mpmath"
 version = "1.3.0"
@@ -3011,6 +3563,23 @@ version = "1.1.2"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/4d/f2/bfb55a6236ed8725a96b0aa3acbd0ec17588e6a2c3b62a93eb513ed8783f/msgpack-1.1.2.tar.gz", hash = "sha256:3b60763c1373dd60f398488069bcdc703cd08a711477b5d480eecc9f9626f47e", size = 173581, upload-time = "2025-10-08T09:15:56.596Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/f5/a2/3b68a9e769db68668b25c6108444a35f9bd163bb848c0650d516761a59c0/msgpack-1.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0051fffef5a37ca2cd16978ae4f0aef92f164df86823871b5162812bebecd8e2", size = 81318, upload-time = "2025-10-08T09:14:38.722Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/e1/2b720cc341325c00be44e1ed59e7cfeae2678329fbf5aa68f5bda57fe728/msgpack-1.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a605409040f2da88676e9c9e5853b3449ba8011973616189ea5ee55ddbc5bc87", size = 83786, upload-time = "2025-10-08T09:14:40.082Z" },
+    { url = "https://files.pythonhosted.org/packages/71/e5/c2241de64bfceac456b140737812a2ab310b10538a7b34a1d393b748e095/msgpack-1.1.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b696e83c9f1532b4af884045ba7f3aa741a63b2bc22617293a2c6a7c645f251", size = 398240, upload-time = "2025-10-08T09:14:41.151Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/09/2a06956383c0fdebaef5aa9246e2356776f12ea6f2a44bd1368abf0e46c4/msgpack-1.1.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:365c0bbe981a27d8932da71af63ef86acc59ed5c01ad929e09a0b88c6294e28a", size = 406070, upload-time = "2025-10-08T09:14:42.821Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/74/2957703f0e1ef20637d6aead4fbb314330c26f39aa046b348c7edcf6ca6b/msgpack-1.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:41d1a5d875680166d3ac5c38573896453bbbea7092936d2e107214daf43b1d4f", size = 393403, upload-time = "2025-10-08T09:14:44.38Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/09/3bfc12aa90f77b37322fc33e7a8a7c29ba7c8edeadfa27664451801b9860/msgpack-1.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:354e81bcdebaab427c3df4281187edc765d5d76bfb3a7c125af9da7a27e8458f", size = 398947, upload-time = "2025-10-08T09:14:45.56Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/4f/05fcebd3b4977cb3d840f7ef6b77c51f8582086de5e642f3fefee35c86fc/msgpack-1.1.2-cp310-cp310-win32.whl", hash = "sha256:e64c8d2f5e5d5fda7b842f55dec6133260ea8f53c4257d64494c534f306bf7a9", size = 64769, upload-time = "2025-10-08T09:14:47.334Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/3e/b4547e3a34210956382eed1c85935fff7e0f9b98be3106b3745d7dec9c5e/msgpack-1.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:db6192777d943bdaaafb6ba66d44bf65aa0e9c5616fa1d2da9bb08828c6b39aa", size = 71293, upload-time = "2025-10-08T09:14:48.665Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/97/560d11202bcd537abca693fd85d81cebe2107ba17301de42b01ac1677b69/msgpack-1.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2e86a607e558d22985d856948c12a3fa7b42efad264dca8a3ebbcfa2735d786c", size = 82271, upload-time = "2025-10-08T09:14:49.967Z" },
+    { url = "https://files.pythonhosted.org/packages/83/04/28a41024ccbd67467380b6fb440ae916c1e4f25e2cd4c63abe6835ac566e/msgpack-1.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:283ae72fc89da59aa004ba147e8fc2f766647b1251500182fac0350d8af299c0", size = 84914, upload-time = "2025-10-08T09:14:50.958Z" },
+    { url = "https://files.pythonhosted.org/packages/71/46/b817349db6886d79e57a966346cf0902a426375aadc1e8e7a86a75e22f19/msgpack-1.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:61c8aa3bd513d87c72ed0b37b53dd5c5a0f58f2ff9f26e1555d3bd7948fb7296", size = 416962, upload-time = "2025-10-08T09:14:51.997Z" },
+    { url = "https://files.pythonhosted.org/packages/da/e0/6cc2e852837cd6086fe7d8406af4294e66827a60a4cf60b86575a4a65ca8/msgpack-1.1.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:454e29e186285d2ebe65be34629fa0e8605202c60fbc7c4c650ccd41870896ef", size = 426183, upload-time = "2025-10-08T09:14:53.477Z" },
+    { url = "https://files.pythonhosted.org/packages/25/98/6a19f030b3d2ea906696cedd1eb251708e50a5891d0978b012cb6107234c/msgpack-1.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:7bc8813f88417599564fafa59fd6f95be417179f76b40325b500b3c98409757c", size = 411454, upload-time = "2025-10-08T09:14:54.648Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/cd/9098fcb6adb32187a70b7ecaabf6339da50553351558f37600e53a4a2a23/msgpack-1.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bafca952dc13907bdfdedfc6a5f579bf4f292bdd506fadb38389afa3ac5b208e", size = 422341, upload-time = "2025-10-08T09:14:56.328Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/ae/270cecbcf36c1dc85ec086b33a51a4d7d08fc4f404bdbc15b582255d05ff/msgpack-1.1.2-cp311-cp311-win32.whl", hash = "sha256:602b6740e95ffc55bfb078172d279de3773d7b7db1f703b2f1323566b878b90e", size = 64747, upload-time = "2025-10-08T09:14:57.882Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/79/309d0e637f6f37e83c711f547308b91af02b72d2326ddd860b966080ef29/msgpack-1.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:d198d275222dc54244bf3327eb8cbe00307d220241d9cec4d306d49a44e85f68", size = 71633, upload-time = "2025-10-08T09:14:59.177Z" },
+    { url = "https://files.pythonhosted.org/packages/73/4d/7c4e2b3d9b1106cd0aa6cb56cc57c6267f59fa8bfab7d91df5adc802c847/msgpack-1.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:86f8136dfa5c116365a8a651a7d7484b65b13339731dd6faebb9a0242151c406", size = 64755, upload-time = "2025-10-08T09:15:00.48Z" },
     { url = "https://files.pythonhosted.org/packages/ad/bd/8b0d01c756203fbab65d265859749860682ccd2a59594609aeec3a144efa/msgpack-1.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:70a0dff9d1f8da25179ffcf880e10cf1aad55fdb63cd59c9a49a1b82290062aa", size = 81939, upload-time = "2025-10-08T09:15:01.472Z" },
     { url = "https://files.pythonhosted.org/packages/34/68/ba4f155f793a74c1483d4bdef136e1023f7bcba557f0db4ef3db3c665cf1/msgpack-1.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:446abdd8b94b55c800ac34b102dffd2f6aa0ce643c55dfc017ad89347db3dbdb", size = 85064, upload-time = "2025-10-08T09:15:03.764Z" },
     { url = "https://files.pythonhosted.org/packages/f2/60/a064b0345fc36c4c3d2c743c82d9100c40388d77f0b48b2f04d6041dbec1/msgpack-1.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c63eea553c69ab05b6747901b97d620bb2a690633c77f23feb0c6a947a8a7b8f", size = 417131, upload-time = "2025-10-08T09:15:05.136Z" },
@@ -3051,7 +3620,7 @@ wheels = [
 
 [[package]]
 name = "multi-storage-client"
-version = "0.46.0"
+version = "0.45.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
@@ -3069,20 +3638,65 @@ dependencies = [
     { name = "xattr" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b0/da/1294bebc3a3a842ab084a3638a04b36dd094a7e5717573048b55551270fe/multi_storage_client-0.46.0-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:e6dedd4e93e1f6d7328cc616fbc4ddeb5289d4390a10783863f6ccfe00cb56dc", size = 9514199, upload-time = "2026-04-10T20:57:31.185Z" },
-    { url = "https://files.pythonhosted.org/packages/ac/dc/ec67ba28c744f80cfba44225f19ac56b0fb804cdaf8fcf4ac8c925da47e2/multi_storage_client-0.46.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4f210266ab0d118161fb79f66d1f6027b52e10c63a41af27a3300dfaea6bb1a7", size = 5765225, upload-time = "2026-04-10T20:58:19.728Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/56/f979a4f7496843328d6cf4e09b1cfa76293fe25fb7f78a60aef2dc5ecfc2/multi_storage_client-0.46.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdedff4ebc431264494fe8fe0b7ab6b4ca971f5687772adace1604788034cf75", size = 5965749, upload-time = "2026-04-10T20:56:19.951Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/71/c5fe5fbfc47218127c05e7d42511e90fe1b7c47ca3801b8339820df74388/multi_storage_client-0.46.0-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4b4817b0ed092ec536fdd89fbd1519e024054a66c340273bf2ad13d1f67d2f3f", size = 9509882, upload-time = "2026-04-10T20:57:07.779Z" },
-    { url = "https://files.pythonhosted.org/packages/5d/ce/25cfb4a854c841305a66ca3735ad1ea13c0947b963297f03afaf67f8aa42/multi_storage_client-0.46.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f47b7d64f00be68a320665d316ec0559b63b080351a7fd0c2900762bb824cfe", size = 5765904, upload-time = "2026-04-10T21:16:14.759Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/3b/0a7703ad775b2b9c41bb5560650f69d2e883f082163b0f46427826337a86/multi_storage_client-0.46.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51ff5e4313ec6df7d94dbffc3dded21cadd40293cb821efd50b4669b6498490c", size = 5965829, upload-time = "2026-04-10T20:56:42.944Z" },
+    { url = "https://files.pythonhosted.org/packages/36/1a/a7f3ed32b728f9e2f001c0cd9d1f3913e48e920e2b1ac67ff5d61f578e0d/multi_storage_client-0.45.1-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:aab4eda34792150ca774c3231937a490a0fc14c90ec97edf936cdfe10d7d23e0", size = 9012181, upload-time = "2026-03-24T17:41:44.311Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/b5/9024e6b948ba36490054d422d8f8e6dd17c3ecbcd8cbd58a25df9a74f2c1/multi_storage_client-0.45.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:23cef3c9fc8e8466e9d922ddef36531422cb7270b505219290ea6524b627fb04", size = 5397985, upload-time = "2026-03-24T17:36:50.062Z" },
+    { url = "https://files.pythonhosted.org/packages/97/4c/3e092a3f5cad3cca432ffee9141cde925d51c6333ea6efda46a1cb4d196a/multi_storage_client-0.45.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6f366d77d799536fd814e3810f50ee67712fbbf8e8a5c07c61f5501df7840df", size = 5588969, upload-time = "2026-03-24T17:40:33.869Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/17/17b9149054cb0eef8a848d5e67fb87c7464b790033752412effe458ddc7b/multi_storage_client-0.45.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:4553a8b7e7b24c9fda8e3f8e2d067e5b69eb427b8640cfed1019830097588cc0", size = 9011728, upload-time = "2026-03-24T17:36:26.763Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/a8/7312befa401395b4901ab78c3fb35a3dc6f0734ad7377eb0dc2703a3bf61/multi_storage_client-0.45.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:fa35561f5d73cdd5996a0394a6acab878b51f32e1759aec806e81ed07249c5aa", size = 5397911, upload-time = "2026-03-24T17:40:57.58Z" },
+    { url = "https://files.pythonhosted.org/packages/54/c5/2f17d5ef4bf2964ddcdfe366e5ab19ea0d25fa5f0dfdb33410624ef85625/multi_storage_client-0.45.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c60a71cd0bbdf255e14378bfd79d499bf220eca34321bf770c19d181b73208ea", size = 5589127, upload-time = "2026-03-24T17:35:27.568Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/59/b2a05f8ecfdfe3c5324b8ff7ee17434df56130e2cd5f8b35d1c5b188c4de/multi_storage_client-0.45.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:ef7dff8c27c1c092b58ba06a37aac0a2441a109202f36afc986b05e1e73f97a2", size = 9011744, upload-time = "2026-03-24T17:39:22.665Z" },
+    { url = "https://files.pythonhosted.org/packages/af/b9/d0c25e18eadc5a6f14b1c4c53b781d7f669bd310046d35dc658ab0f98b4c/multi_storage_client-0.45.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1439d6acd6957c6d4e118305e08fb029c4b36814ca9c8070dcacc741b351ba9a", size = 5395849, upload-time = "2026-03-24T17:39:46.041Z" },
+    { url = "https://files.pythonhosted.org/packages/33/9b/0134d06d66a99e1fd39c6d2c1006cbca59e04db39d339a4de0bfcaacb188/multi_storage_client-0.45.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d8960b5d661cbf79349186089f59a57a9f22703d41e99e73a17c9e0f1c2737f9", size = 5585758, upload-time = "2026-03-24T17:40:10.248Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/72/565fe8f0a6ce1639719fccc5049d9dbab4d007c94978254eca3134631b4c/multi_storage_client-0.45.1-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:9de9f980d662ee741ef8ddfd84a72b6e5ab55cd91162f5182a6ad9bf75f4be8d", size = 9008456, upload-time = "2026-03-24T17:41:20.877Z" },
+    { url = "https://files.pythonhosted.org/packages/34/b2/72ca7f3d93a147edf47fe384e71146e4df0c679537acb7fe3436c6018335/multi_storage_client-0.45.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da7285874857102b2562d2d381cde115c6cdcc3b6d0c5bc1e30c5f41a00a1518", size = 5395483, upload-time = "2026-03-24T17:38:58.779Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/00/070aec0b31cae59633beaf3e696f8ecc84ae8c084aa414346bd71b95afc1/multi_storage_client-0.45.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e62bf1442931d9923d25ee7c5067a491a431294d8e9fbf56546c9ae3e1542537", size = 5585772, upload-time = "2026-03-24T17:35:53.259Z" },
 ]
 
 [[package]]
 name = "multidict"
 version = "6.7.1"
 source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
 sdist = { url = "https://files.pythonhosted.org/packages/1a/c2/c2d94cbe6ac1753f3fc980da97b3d930efe1da3af3c9f5125354436c073d/multidict-6.7.1.tar.gz", hash = "sha256:ec6652a1bee61c53a3e5776b6049172c53b6aaba34f18c9ad04f82712bac623d", size = 102010, upload-time = "2026-01-26T02:46:45.979Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/0b/19348d4c98980c4851d2f943f8ebafdece2ae7ef737adcfa5994ce8e5f10/multidict-6.7.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:c93c3db7ea657dd4637d57e74ab73de31bccefe144d3d4ce370052035bc85fb5", size = 77176, upload-time = "2026-01-26T02:42:59.784Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/04/9de3f8077852e3d438215c81e9b691244532d2e05b4270e89ce67b7d103c/multidict-6.7.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:974e72a2474600827abaeda71af0c53d9ebbc3c2eb7da37b37d7829ae31232d8", size = 44996, upload-time = "2026-01-26T02:43:01.674Z" },
+    { url = "https://files.pythonhosted.org/packages/31/5c/08c7f7fe311f32e83f7621cd3f99d805f45519cd06fafb247628b861da7d/multidict-6.7.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cdea2e7b2456cfb6694fb113066fd0ec7ea4d67e3a35e1f4cbeea0b448bf5872", size = 44631, upload-time = "2026-01-26T02:43:03.169Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/7f/0e3b1390ae772f27501199996b94b52ceeb64fe6f9120a32c6c3f6b781be/multidict-6.7.1-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:17207077e29342fdc2c9a82e4b306f1127bf1ea91f8b71e02d4798a70bb99991", size = 242561, upload-time = "2026-01-26T02:43:04.733Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/f4/8719f4f167586af317b69dd3e90f913416c91ca610cac79a45c53f590312/multidict-6.7.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4f49cb5661344764e4c7c7973e92a47a59b8fc19b6523649ec9dc4960e58a03", size = 242223, upload-time = "2026-01-26T02:43:06.695Z" },
+    { url = "https://files.pythonhosted.org/packages/47/ab/7c36164cce64a6ad19c6d9a85377b7178ecf3b89f8fd589c73381a5eedfd/multidict-6.7.1-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:a9fc4caa29e2e6ae408d1c450ac8bf19892c5fca83ee634ecd88a53332c59981", size = 222322, upload-time = "2026-01-26T02:43:08.472Z" },
+    { url = "https://files.pythonhosted.org/packages/f5/79/a25add6fb38035b5337bc5734f296d9afc99163403bbcf56d4170f97eb62/multidict-6.7.1-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c5f0c21549ab432b57dcc82130f388d84ad8179824cc3f223d5e7cfbfd4143f6", size = 254005, upload-time = "2026-01-26T02:43:10.127Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/7b/64a87cf98e12f756fc8bd444b001232ffff2be37288f018ad0d3f0aae931/multidict-6.7.1-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7dfb78d966b2c906ae1d28ccf6e6712a3cd04407ee5088cd276fe8cb42186190", size = 251173, upload-time = "2026-01-26T02:43:11.731Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/ac/b605473de2bb404e742f2cc3583d12aedb2352a70e49ae8fce455b50c5aa/multidict-6.7.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b0d9b91d1aa44db9c1f1ecd0d9d2ae610b2f4f856448664e01a3b35899f3f92", size = 243273, upload-time = "2026-01-26T02:43:13.063Z" },
+    { url = "https://files.pythonhosted.org/packages/03/65/11492d6a0e259783720f3bc1d9ea55579a76f1407e31ed44045c99542004/multidict-6.7.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dd96c01a9dcd4889dcfcf9eb5544ca0c77603f239e3ffab0524ec17aea9a93ee", size = 238956, upload-time = "2026-01-26T02:43:14.843Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/a7/7ee591302af64e7c196fb63fe856c788993c1372df765102bd0448e7e165/multidict-6.7.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:067343c68cd6612d375710f895337b3a98a033c94f14b9a99eff902f205424e2", size = 233477, upload-time = "2026-01-26T02:43:16.025Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/99/c109962d58756c35fd9992fed7f2355303846ea2ff054bb5f5e9d6b888de/multidict-6.7.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5884a04f4ff56c6120f6ccf703bdeb8b5079d808ba604d4d53aec0d55dc33568", size = 243615, upload-time = "2026-01-26T02:43:17.84Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/5f/1973e7c771c86e93dcfe1c9cc55a5481b610f6614acfc28c0d326fe6bfad/multidict-6.7.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8affcf1c98b82bc901702eb73b6947a1bfa170823c153fe8a47b5f5f02e48e40", size = 249930, upload-time = "2026-01-26T02:43:19.06Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/a5/f170fc2268c3243853580203378cd522446b2df632061e0a5409817854c7/multidict-6.7.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:0d17522c37d03e85c8098ec8431636309b2682cf12e58f4dbc76121fb50e4962", size = 243807, upload-time = "2026-01-26T02:43:20.286Z" },
+    { url = "https://files.pythonhosted.org/packages/de/01/73856fab6d125e5bc652c3986b90e8699a95e84b48d72f39ade6c0e74a8c/multidict-6.7.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:24c0cf81544ca5e17cfcb6e482e7a82cd475925242b308b890c9452a074d4505", size = 239103, upload-time = "2026-01-26T02:43:21.508Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/46/f1220bd9944d8aa40d8ccff100eeeee19b505b857b6f603d6078cb5315b0/multidict-6.7.1-cp310-cp310-win32.whl", hash = "sha256:d82dd730a95e6643802f4454b8fdecdf08667881a9c5670db85bc5a56693f122", size = 41416, upload-time = "2026-01-26T02:43:22.703Z" },
+    { url = "https://files.pythonhosted.org/packages/68/00/9b38e272a770303692fc406c36e1a4c740f401522d5787691eb38a8925a8/multidict-6.7.1-cp310-cp310-win_amd64.whl", hash = "sha256:cf37cbe5ced48d417ba045aca1b21bafca67489452debcde94778a576666a1df", size = 46022, upload-time = "2026-01-26T02:43:23.77Z" },
+    { url = "https://files.pythonhosted.org/packages/64/65/d8d42490c02ee07b6bbe00f7190d70bb4738b3cce7629aaf9f213ef730dd/multidict-6.7.1-cp310-cp310-win_arm64.whl", hash = "sha256:59bc83d3f66b41dac1e7460aac1d196edc70c9ba3094965c467715a70ecb46db", size = 43238, upload-time = "2026-01-26T02:43:24.882Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/f1/a90635c4f88fb913fbf4ce660b83b7445b7a02615bda034b2f8eb38fd597/multidict-6.7.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:7ff981b266af91d7b4b3793ca3382e53229088d193a85dfad6f5f4c27fc73e5d", size = 76626, upload-time = "2026-01-26T02:43:26.485Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/9b/267e64eaf6fc637a15b35f5de31a566634a2740f97d8d094a69d34f524a4/multidict-6.7.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:844c5bca0b5444adb44a623fb0a1310c2f4cd41f402126bb269cd44c9b3f3e1e", size = 44706, upload-time = "2026-01-26T02:43:27.607Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/a4/d45caf2b97b035c57267791ecfaafbd59c68212004b3842830954bb4b02e/multidict-6.7.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f2a0a924d4c2e9afcd7ec64f9de35fcd96915149b2216e1cb2c10a56df483855", size = 44356, upload-time = "2026-01-26T02:43:28.661Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/d2/0a36c8473f0cbaeadd5db6c8b72d15bbceeec275807772bfcd059bef487d/multidict-6.7.1-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8be1802715a8e892c784c0197c2ace276ea52702a0ede98b6310c8f255a5afb3", size = 244355, upload-time = "2026-01-26T02:43:31.165Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/16/8c65be997fd7dd311b7d39c7b6e71a0cb449bad093761481eccbbe4b42a2/multidict-6.7.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2e2d2ed645ea29f31c4c7ea1552fcfd7cb7ba656e1eafd4134a6620c9f5fdd9e", size = 246433, upload-time = "2026-01-26T02:43:32.581Z" },
+    { url = "https://files.pythonhosted.org/packages/01/fb/4dbd7e848d2799c6a026ec88ad39cf2b8416aa167fcc903baa55ecaa045c/multidict-6.7.1-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:95922cee9a778659e91db6497596435777bd25ed116701a4c034f8e46544955a", size = 225376, upload-time = "2026-01-26T02:43:34.417Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/8a/4a3a6341eac3830f6053062f8fbc9a9e54407c80755b3f05bc427295c2d0/multidict-6.7.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6b83cabdc375ffaaa15edd97eb7c0c672ad788e2687004990074d7d6c9b140c8", size = 257365, upload-time = "2026-01-26T02:43:35.741Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/a2/dd575a69c1aa206e12d27d0770cdf9b92434b48a9ef0cd0d1afdecaa93c4/multidict-6.7.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:38fb49540705369bab8484db0689d86c0a33a0a9f2c1b197f506b71b4b6c19b0", size = 254747, upload-time = "2026-01-26T02:43:36.976Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/56/21b27c560c13822ed93133f08aa6372c53a8e067f11fbed37b4adcdac922/multidict-6.7.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:439cbebd499f92e9aa6793016a8acaa161dfa749ae86d20960189f5398a19144", size = 246293, upload-time = "2026-01-26T02:43:38.258Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/a4/23466059dc3854763423d0ad6c0f3683a379d97673b1b89ec33826e46728/multidict-6.7.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6d3bc717b6fe763b8be3f2bee2701d3c8eb1b2a8ae9f60910f1b2860c82b6c49", size = 242962, upload-time = "2026-01-26T02:43:40.034Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/67/51dd754a3524d685958001e8fa20a0f5f90a6a856e0a9dcabff69be3dbb7/multidict-6.7.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:619e5a1ac57986dbfec9f0b301d865dddf763696435e2962f6d9cf2fdff2bb71", size = 237360, upload-time = "2026-01-26T02:43:41.752Z" },
+    { url = "https://files.pythonhosted.org/packages/64/3f/036dfc8c174934d4b55d86ff4f978e558b0e585cef70cfc1ad01adc6bf18/multidict-6.7.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:0b38ebffd9be37c1170d33bc0f36f4f262e0a09bc1aac1c34c7aa51a7293f0b3", size = 245940, upload-time = "2026-01-26T02:43:43.042Z" },
+    { url = "https://files.pythonhosted.org/packages/3d/20/6214d3c105928ebc353a1c644a6ef1408bc5794fcb4f170bb524a3c16311/multidict-6.7.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:10ae39c9cfe6adedcdb764f5e8411d4a92b055e35573a2eaa88d3323289ef93c", size = 253502, upload-time = "2026-01-26T02:43:44.371Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/e2/c653bc4ae1be70a0f836b82172d643fcf1dade042ba2676ab08ec08bff0f/multidict-6.7.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:25167cc263257660290fba06b9318d2026e3c910be240a146e1f66dd114af2b0", size = 247065, upload-time = "2026-01-26T02:43:45.745Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/11/a854b4154cd3bd8b1fd375e8a8ca9d73be37610c361543d56f764109509b/multidict-6.7.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:128441d052254f42989ef98b7b6a6ecb1e6f708aa962c7984235316db59f50fa", size = 241870, upload-time = "2026-01-26T02:43:47.054Z" },
+    { url = "https://files.pythonhosted.org/packages/13/bf/9676c0392309b5fdae322333d22a829715b570edb9baa8016a517b55b558/multidict-6.7.1-cp311-cp311-win32.whl", hash = "sha256:d62b7f64ffde3b99d06b707a280db04fb3855b55f5a06df387236051d0668f4a", size = 41302, upload-time = "2026-01-26T02:43:48.753Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/68/f16a3a8ba6f7b6dc92a1f19669c0810bd2c43fc5a02da13b1cbf8e253845/multidict-6.7.1-cp311-cp311-win_amd64.whl", hash = "sha256:bdbf9f3b332abd0cdb306e7c2113818ab1e922dc84b8f8fd06ec89ed2a19ab8b", size = 45981, upload-time = "2026-01-26T02:43:49.921Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/ad/9dd5305253fa00cd3c7555dbef69d5bf4133debc53b87ab8d6a44d411665/multidict-6.7.1-cp311-cp311-win_arm64.whl", hash = "sha256:b8c990b037d2fff2f4e33d3f21b9b531c5745b33a49a7d6dbe7a177266af44f6", size = 43159, upload-time = "2026-01-26T02:43:51.635Z" },
     { url = "https://files.pythonhosted.org/packages/8d/9c/f20e0e2cf80e4b2e4b1c365bf5fe104ee633c751a724246262db8f1a0b13/multidict-6.7.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a90f75c956e32891a4eda3639ce6dd86e87105271f43d43442a3aedf3cddf172", size = 76893, upload-time = "2026-01-26T02:43:52.754Z" },
     { url = "https://files.pythonhosted.org/packages/fe/cf/18ef143a81610136d3da8193da9d80bfe1cb548a1e2d1c775f26b23d024a/multidict-6.7.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:3fccb473e87eaa1382689053e4a4618e7ba7b9b9b8d6adf2027ee474597128cd", size = 45456, upload-time = "2026-01-26T02:43:53.893Z" },
     { url = "https://files.pythonhosted.org/packages/a9/65/1caac9d4cd32e8433908683446eebc953e82d22b03d10d41a5f0fefe991b/multidict-6.7.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0fa96985700739c4c7853a43c0b3e169360d6855780021bfc6d0f1ce7c123e7", size = 43872, upload-time = "2026-01-26T02:43:55.041Z" },
@@ -3185,6 +3799,12 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/a2/f2/e783ac7f2aeeed14e9e12801f22529cc7e6b7ab80928d6dcce4e9f00922d/multiprocess-0.70.19.tar.gz", hash = "sha256:952021e0e6c55a4a9fe4cd787895b86e239a40e76802a789d6305398d3975897", size = 2079989, upload-time = "2026-01-19T06:47:39.744Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/8b/b6/10832f96b499690854e574360be342a282f5f7dba58eff791299ff6c0637/multiprocess-0.70.19-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:02e5c35d7d6cd2bdc89c1858867f7bde4012837411023a4696c148c1bdd7c80e", size = 135131, upload-time = "2026-01-19T06:47:20.479Z" },
+    { url = "https://files.pythonhosted.org/packages/99/50/faef2d8106534b0dc4a0b772668a1a99682696ebf17d3c0f13f2ed6a656a/multiprocess-0.70.19-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:79576c02d1207ec405b00cabf2c643c36070800cca433860e14539df7818b2aa", size = 135131, upload-time = "2026-01-19T06:47:21.879Z" },
+    { url = "https://files.pythonhosted.org/packages/94/b1/0b71d18b76bf423c2e8ee00b31db37d17297ab3b4db44e188692afdca628/multiprocess-0.70.19-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:c6b6d78d43a03b68014ca1f0b7937d965393a670c5de7c29026beb2258f2f896", size = 135134, upload-time = "2026-01-19T06:47:23.262Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/aa/714635c727dbfc251139226fa4eaf1b07f00dc12d9cd2eb25f931adaf873/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:1bbf1b69af1cf64cd05f65337d9215b88079ec819cd0ea7bac4dab84e162efe7", size = 144743, upload-time = "2026-01-19T06:47:24.562Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/e1/155f6abf5e6b5d9cef29b6d0167c180846157a4aca9b9bee1a217f67c959/multiprocess-0.70.19-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:5be9ec7f0c1c49a4f4a6fd20d5dda4aeabc2d39a50f4ad53720f1cd02b3a7c2e", size = 144738, upload-time = "2026-01-19T06:47:26.636Z" },
+    { url = "https://files.pythonhosted.org/packages/af/cb/f421c2869d75750a4f32301cc20c4b63fab6376e9a75c8e5e655bdeb3d9b/multiprocess-0.70.19-pp311-pypy311_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:1c3dce098845a0db43b32a0b76a228ca059a668071cfeaa0f40c36c0b1585d45", size = 144741, upload-time = "2026-01-19T06:47:27.985Z" },
     { url = "https://files.pythonhosted.org/packages/e3/45/8004d1e6b9185c1a444d6b55ac5682acf9d98035e54386d967366035a03a/multiprocess-0.70.19-py310-none-any.whl", hash = "sha256:97404393419dcb2a8385910864eedf47a3cadf82c66345b44f036420eb0b5d87", size = 134948, upload-time = "2026-01-19T06:47:32.325Z" },
     { url = "https://files.pythonhosted.org/packages/86/c2/dec9722dc3474c164a0b6bcd9a7ed7da542c98af8cabce05374abab35edd/multiprocess-0.70.19-py311-none-any.whl", hash = "sha256:928851ae7973aea4ce0eaf330bbdafb2e01398a91518d5c8818802845564f45c", size = 144457, upload-time = "2026-01-19T06:47:33.711Z" },
     { url = "https://files.pythonhosted.org/packages/71/70/38998b950a97ea279e6bd657575d22d1a2047256caf707d9a10fbce4f065/multiprocess-0.70.19-py312-none-any.whl", hash = "sha256:3a56c0e85dd5025161bac5ce138dcac1e49174c7d8e74596537e729fd5c53c28", size = 150281, upload-time = "2026-01-19T06:47:35.037Z" },
@@ -3202,17 +3822,104 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/79/7b/2c79738432f5c924bef5071f933bcc9efd0473bac3b4aa584a6f7c1c8df8/mypy_extensions-1.1.0-py3-none-any.whl", hash = "sha256:1be4cccdb0f2482337c4743e60421de3a356cd97508abadd57d47403e94f5505", size = 4963, upload-time = "2025-04-22T14:54:22.983Z" },
 ]
 
+[[package]]
+name = "myst-parser"
+version = "4.0.1"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+dependencies = [
+    { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "markdown-it-py", version = "3.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "mdit-py-plugins", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pyyaml", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/66/a5/9626ba4f73555b3735ad86247a8077d4603aa8628537687c839ab08bfe44/myst_parser-4.0.1.tar.gz", hash = "sha256:5cfea715e4f3574138aecbf7d54132296bfd72bb614d31168f48c477a830a7c4", size = 93985, upload-time = "2025-02-12T10:53:03.833Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5f/df/76d0321c3797b54b60fef9ec3bd6f4cfd124b9e422182156a1dd418722cf/myst_parser-4.0.1-py3-none-any.whl", hash = "sha256:9134e88959ec3b5780aedf8a99680ea242869d012e8821db3126d427edc9c95d", size = 84579, upload-time = "2025-02-12T10:53:02.078Z" },
+]
+
 [[package]]
 name = "myst-parser"
 version = "5.0.0"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
 dependencies = [
-    { name = "docutils" },
-    { name = "jinja2" },
-    { name = "markdown-it-py" },
-    { name = "mdit-py-plugins" },
-    { name = "pyyaml" },
-    { name = "sphinx" },
+    { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "markdown-it-py", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "mdit-py-plugins", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pyyaml", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/33/fa/7b45eef11b7971f0beb29d27b7bfe0d747d063aa29e170d9edd004733c8a/myst_parser-5.0.0.tar.gz", hash = "sha256:f6f231452c56e8baa662cc352c548158f6a16fcbd6e3800fc594978002b94f3a", size = 98535, upload-time = "2026-01-15T09:08:18.036Z" }
 wheels = [
@@ -3230,20 +3937,98 @@ dependencies = [
     { name = "inquirerpy" },
     { name = "jinja2" },
     { name = "leptonai" },
-    { name = "networkx" },
+    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "omegaconf" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "packaging" },
     { name = "rich" },
     { name = "toml" },
     { name = "torchx" },
     { name = "typer" },
 ]
 
+[[package]]
+name = "networkx"
+version = "3.4.2"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+sdist = { url = "https://files.pythonhosted.org/packages/fd/1d/06475e1cd5264c0b870ea2cc6fdb3e37177c1e565c43f56ff17a10e3937f/networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1", size = 2151368, upload-time = "2024-10-21T12:39:38.695Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b9/54/dd730b32ea14ea797530a4479b2ed46a6fb250f682a9cfb997e968bf0261/networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f", size = 1723263, upload-time = "2024-10-21T12:39:36.247Z" },
+]
+
 [[package]]
 name = "networkx"
 version = "3.6.1"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
 sdist = { url = "https://files.pythonhosted.org/packages/6a/51/63fe664f3908c97be9d2e4f1158eb633317598cfa6e1fc14af5383f17512/networkx-3.6.1.tar.gz", hash = "sha256:26b7c357accc0c8cde558ad486283728b65b6a95d85ee1cd66bafab4c8168509", size = 2517025, upload-time = "2025-12-08T17:02:39.908Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/9e/c9/b2622292ea83fbb4ec318f5b9ab867d0a28ab43c5717bb85b0a5f6b3b0a4/networkx-3.6.1-py3-none-any.whl", hash = "sha256:d47fbf302e7d9cbbb9e2555a0d267983d2aa476bac30e90dfbe5669bd57f3762", size = 2068504, upload-time = "2025-12-08T17:02:38.159Z" },
@@ -3329,27 +4114,58 @@ name = "numpy"
 version = "2.0.2"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/a9/75/10dd1f8116a8b796cb2c737b674e02d02e80454bda953fa7e65d8c12b016/numpy-2.0.2.tar.gz", hash = "sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78", size = 18902015, upload-time = "2024-08-26T20:19:40.945Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/21/91/3495b3237510f79f5d81f2508f9f13fea78ebfdf07538fc7444badda173d/numpy-2.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece", size = 21165245, upload-time = "2024-08-26T20:04:14.625Z" },
+    { url = "https://files.pythonhosted.org/packages/05/33/26178c7d437a87082d11019292dce6d3fe6f0e9026b7b2309cbf3e489b1d/numpy-2.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04", size = 13738540, upload-time = "2024-08-26T20:04:36.784Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/31/cc46e13bf07644efc7a4bf68df2df5fb2a1a88d0cd0da9ddc84dc0033e51/numpy-2.0.2-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66", size = 5300623, upload-time = "2024-08-26T20:04:46.491Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/16/7bfcebf27bb4f9d7ec67332ffebee4d1bf085c84246552d52dbb548600e7/numpy-2.0.2-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b", size = 6901774, upload-time = "2024-08-26T20:04:58.173Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/a3/561c531c0e8bf082c5bef509d00d56f82e0ea7e1e3e3a7fc8fa78742a6e5/numpy-2.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd", size = 13907081, upload-time = "2024-08-26T20:05:19.098Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/66/f7177ab331876200ac7563a580140643d1179c8b4b6a6b0fc9838de2a9b8/numpy-2.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318", size = 19523451, upload-time = "2024-08-26T20:05:47.479Z" },
+    { url = "https://files.pythonhosted.org/packages/25/7f/0b209498009ad6453e4efc2c65bcdf0ae08a182b2b7877d7ab38a92dc542/numpy-2.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8", size = 19927572, upload-time = "2024-08-26T20:06:17.137Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/df/2619393b1e1b565cd2d4c4403bdd979621e2c4dea1f8532754b2598ed63b/numpy-2.0.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326", size = 14400722, upload-time = "2024-08-26T20:06:39.16Z" },
+    { url = "https://files.pythonhosted.org/packages/22/ad/77e921b9f256d5da36424ffb711ae79ca3f451ff8489eeca544d0701d74a/numpy-2.0.2-cp310-cp310-win32.whl", hash = "sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97", size = 6472170, upload-time = "2024-08-26T20:06:50.361Z" },
+    { url = "https://files.pythonhosted.org/packages/10/05/3442317535028bc29cf0c0dd4c191a4481e8376e9f0db6bcf29703cadae6/numpy-2.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131", size = 15905558, upload-time = "2024-08-26T20:07:13.881Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/cf/034500fb83041aa0286e0fb16e7c76e5c8b67c0711bb6e9e9737a717d5fe/numpy-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448", size = 21169137, upload-time = "2024-08-26T20:07:45.345Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/d9/32de45561811a4b87fbdee23b5797394e3d1504b4a7cf40c10199848893e/numpy-2.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195", size = 13703552, upload-time = "2024-08-26T20:08:06.666Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/ca/2f384720020c7b244d22508cb7ab23d95f179fcfff33c31a6eeba8d6c512/numpy-2.0.2-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57", size = 5298957, upload-time = "2024-08-26T20:08:15.83Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/78/a3e4f9fb6aa4e6fdca0c5428e8ba039408514388cf62d89651aade838269/numpy-2.0.2-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a", size = 6905573, upload-time = "2024-08-26T20:08:27.185Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/72/cfc3a1beb2caf4efc9d0b38a15fe34025230da27e1c08cc2eb9bfb1c7231/numpy-2.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669", size = 13914330, upload-time = "2024-08-26T20:08:48.058Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/a8/c17acf65a931ce551fee11b72e8de63bf7e8a6f0e21add4c937c83563538/numpy-2.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951", size = 19534895, upload-time = "2024-08-26T20:09:16.536Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/86/8767f3d54f6ae0165749f84648da9dcc8cd78ab65d415494962c86fac80f/numpy-2.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9", size = 19937253, upload-time = "2024-08-26T20:09:46.263Z" },
+    { url = "https://files.pythonhosted.org/packages/df/87/f76450e6e1c14e5bb1eae6836478b1028e096fd02e85c1c37674606ab752/numpy-2.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15", size = 14414074, upload-time = "2024-08-26T20:10:08.483Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/ca/0f0f328e1e59f73754f06e1adfb909de43726d4f24c6a3f8805f34f2b0fa/numpy-2.0.2-cp311-cp311-win32.whl", hash = "sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4", size = 6470640, upload-time = "2024-08-26T20:10:19.732Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/57/3a3f14d3a759dcf9bf6e9eda905794726b758819df4663f217d658a58695/numpy-2.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc", size = 15910230, upload-time = "2024-08-26T20:10:43.413Z" },
     { url = "https://files.pythonhosted.org/packages/45/40/2e117be60ec50d98fa08c2f8c48e09b3edea93cfcabd5a9ff6925d54b1c2/numpy-2.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b", size = 20895803, upload-time = "2024-08-26T20:11:13.916Z" },
     { url = "https://files.pythonhosted.org/packages/46/92/1b8b8dee833f53cef3e0a3f69b2374467789e0bb7399689582314df02651/numpy-2.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e", size = 13471835, upload-time = "2024-08-26T20:11:34.779Z" },
     { url = "https://files.pythonhosted.org/packages/7f/19/e2793bde475f1edaea6945be141aef6c8b4c669b90c90a300a8954d08f0a/numpy-2.0.2-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c", size = 5038499, upload-time = "2024-08-26T20:11:43.902Z" },
@@ -3375,28 +4191,48 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587, upload-time = "2026-03-29T13:22:01.298Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/ef/c6/4218570d8c8ecc9704b5157a3348e486e84ef4be0ed3e38218ab473c83d2/numpy-2.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f983334aea213c99992053ede6168500e5f086ce74fbc4acc3f2b00f5762e9db", size = 16976799, upload-time = "2026-03-29T13:18:15.438Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/92/b4d922c4a5f5dab9ed44e6153908a5c665b71acf183a83b93b690996e39b/numpy-2.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:72944b19f2324114e9dc86a159787333b77874143efcf89a5167ef83cfee8af0", size = 14971552, upload-time = "2026-03-29T13:18:18.606Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/dc/df98c095978fa6ee7b9a9387d1d58cbb3d232d0e69ad169a4ce784bde4fd/numpy-2.4.4-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:86b6f55f5a352b48d7fbfd2dbc3d5b780b2d79f4d3c121f33eb6efb22e9a2015", size = 5476566, upload-time = "2026-03-29T13:18:21.532Z" },
+    { url = "https://files.pythonhosted.org/packages/28/34/b3fdcec6e725409223dd27356bdf5a3c2cc2282e428218ecc9cb7acc9763/numpy-2.4.4-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:ba1f4fc670ed79f876f70082eff4f9583c15fb9a4b89d6188412de4d18ae2f40", size = 6806482, upload-time = "2026-03-29T13:18:23.634Z" },
+    { url = "https://files.pythonhosted.org/packages/68/62/63417c13aa35d57bee1337c67446761dc25ea6543130cf868eace6e8157b/numpy-2.4.4-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a87ec22c87be071b6bdbd27920b129b94f2fc964358ce38f3822635a3e2e03d", size = 15973376, upload-time = "2026-03-29T13:18:26.677Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/c5/9fcb7e0e69cef59cf10c746b84f7d58b08bc66a6b7d459783c5a4f6101a6/numpy-2.4.4-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:df3775294accfdd75f32c74ae39fcba920c9a378a2fc18a12b6820aa8c1fb502", size = 16925137, upload-time = "2026-03-29T13:18:30.14Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/43/80020edacb3f84b9efdd1591120a4296462c23fd8db0dde1666f6ef66f13/numpy-2.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0d4e437e295f18ec29bc79daf55e8a47a9113df44d66f702f02a293d93a2d6dd", size = 17329414, upload-time = "2026-03-29T13:18:33.733Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/06/af0658593b18a5f73532d377188b964f239eb0894e664a6c12f484472f97/numpy-2.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6aa3236c78803afbcb255045fbef97a9e25a1f6c9888357d205ddc42f4d6eba5", size = 18658397, upload-time = "2026-03-29T13:18:37.511Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/ce/13a09ed65f5d0ce5c7dd0669250374c6e379910f97af2c08c57b0608eee4/numpy-2.4.4-cp311-cp311-win32.whl", hash = "sha256:30caa73029a225b2d40d9fae193e008e24b2026b7ee1a867b7ee8d96ca1a448e", size = 6239499, upload-time = "2026-03-29T13:18:40.372Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/63/05d193dbb4b5eec1eca73822d80da98b511f8328ad4ae3ca4caf0f4db91d/numpy-2.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:6bbe4eb67390b0a0265a2c25458f6b90a409d5d069f1041e6aff1e27e3d9a79e", size = 12614257, upload-time = "2026-03-29T13:18:42.95Z" },
+    { url = "https://files.pythonhosted.org/packages/87/c5/8168052f080c26fa984c413305012be54741c9d0d74abd7fbeeccae3889f/numpy-2.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:fcfe2045fd2e8f3cb0ce9d4ba6dba6333b8fa05bb8a4939c908cd43322d14c7e", size = 10486775, upload-time = "2026-03-29T13:18:45.835Z" },
     { url = "https://files.pythonhosted.org/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b", size = 16689272, upload-time = "2026-03-29T13:18:49.223Z" },
     { url = "https://files.pythonhosted.org/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e", size = 14699573, upload-time = "2026-03-29T13:18:52.629Z" },
     { url = "https://files.pythonhosted.org/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842", size = 5204782, upload-time = "2026-03-29T13:18:55.579Z" },
@@ -3450,7 +4286,26 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/ed/ad/483d9e262f4b831000062e5d8a45e342166ec8aaa1195264982bca267e62/numpy-2.4.4-cp314-cp314t-win32.whl", hash = "sha256:dddbbd259598d7240b18c9d87c56a9d2fb3b02fe266f49a7c101532e78c1d871", size = 6155500, upload-time = "2026-03-29T13:21:28.205Z" },
     { url = "https://files.pythonhosted.org/packages/c7/03/2fc4e14c7bd4ff2964b74ba90ecb8552540b6315f201df70f137faa5c589/numpy-2.4.4-cp314-cp314t-win_amd64.whl", hash = "sha256:a7164afb23be6e37ad90b2f10426149fd75aee07ca55653d2aa41e66c4ef697e", size = 12637755, upload-time = "2026-03-29T13:21:31.107Z" },
     { url = "https://files.pythonhosted.org/packages/58/78/548fb8e07b1a341746bfbecb32f2c268470f45fa028aacdbd10d9bc73aab/numpy-2.4.4-cp314-cp314t-win_arm64.whl", hash = "sha256:ba203255017337d39f89bdd58417f03c4426f12beed0440cfd933cb15f8669c7", size = 10566643, upload-time = "2026-03-29T13:21:34.339Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/33/8fae8f964a4f63ed528264ddf25d2b683d0b663e3cba26961eb838a7c1bd/numpy-2.4.4-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:58c8b5929fcb8287cbd6f0a3fae19c6e03a5c48402ae792962ac465224a629a4", size = 16854491, upload-time = "2026-03-29T13:21:38.03Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/d0/1aabee441380b981cf8cdda3ae7a46aa827d1b5a8cce84d14598bc94d6d9/numpy-2.4.4-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:eea7ac5d2dce4189771cedb559c738a71512768210dc4e4753b107a2048b3d0e", size = 14895830, upload-time = "2026-03-29T13:21:41.509Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/b8/aafb0d1065416894fccf4df6b49ef22b8db045187949545bced89c034b8e/numpy-2.4.4-pp311-pypy311_pp73-macosx_14_0_arm64.whl", hash = "sha256:51fc224f7ca4d92656d5a5eb315f12eb5fe2c97a66249aa7b5f562528a3be38c", size = 5400927, upload-time = "2026-03-29T13:21:44.747Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/77/063baa20b08b431038c7f9ff5435540c7b7265c78cf56012a483019ca72d/numpy-2.4.4-pp311-pypy311_pp73-macosx_14_0_x86_64.whl", hash = "sha256:28a650663f7314afc3e6ec620f44f333c386aad9f6fc472030865dc0ebb26ee3", size = 6715557, upload-time = "2026-03-29T13:21:47.406Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/a8/379542d45a14f149444c5c4c4e7714707239ce9cc1de8c2803958889da14/numpy-2.4.4-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:19710a9ca9992d7174e9c52f643d4272dcd1558c5f7af7f6f8190f633bd651a7", size = 15804253, upload-time = "2026-03-29T13:21:50.753Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/c8/f0a45426d6d21e7ea3310a15cf90c43a14d9232c31a837702dba437f3373/numpy-2.4.4-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9b2aec6af35c113b05695ebb5749a787acd63cafc83086a05771d1e1cd1e555f", size = 16753552, upload-time = "2026-03-29T13:21:54.344Z" },
+    { url = "https://files.pythonhosted.org/packages/04/74/f4c001f4714c3ad9ce037e18cf2b9c64871a84951eaa0baf683a9ca9301c/numpy-2.4.4-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:f2cf083b324a467e1ab358c105f6cad5ea950f50524668a80c486ff1db24e119", size = 12509075, upload-time = "2026-03-29T13:21:57.644Z" },
+]
+
+[[package]]
+name = "nv-grouped-gemm"
+version = "1.1.4.post8"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "absl-py" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "torch", marker = "sys_platform == 'never'" },
 ]
+sdist = { url = "https://files.pythonhosted.org/packages/02/ad/046a097b63a96c1ba1d85f0031dbe7fcbdb33e6c445dfbaba2ffaefdd497/nv_grouped_gemm-1.1.4.post8.tar.gz", hash = "sha256:ab321693f0292cfd8a26dc7b6f14decd9eb00e209494de7218e4fad36191275d", size = 20821209, upload-time = "2025-12-17T02:22:38.432Z" }
 
 [[package]]
 name = "nvdlfw-inspect"
@@ -3509,7 +4364,7 @@ name = "nvidia-cudnn-cu13"
 version = "9.19.0.56"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas" },
+    { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f1/84/26025437c1e6b61a707442184fa0c03d083b661adf3a3eecfd6d21677740/nvidia_cudnn_cu13-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:6ed29ffaee1176c612daf442e4dd6cfeb6a0caa43ddcbeb59da94953030b1be4", size = 433781201, upload-time = "2026-02-03T20:40:53.805Z" },
@@ -3519,18 +4374,24 @@ wheels = [
 
 [[package]]
 name = "nvidia-cudnn-frontend"
-version = "1.22.1"
+version = "1.22.0"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/1a/3f/523fb08d9b7be15242ade6e2a641900d05c0e9cfffab8260de37a04ac0d2/nvidia_cudnn_frontend-1.22.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f64fb4e0a45b7a8bb126f91a71d8afc03facf14b82dade51744ca48cf20d2974", size = 2722597, upload-time = "2026-04-10T17:33:54.366Z" },
-    { url = "https://files.pythonhosted.org/packages/34/b7/35c87c334d553bd45809ec957b53f3d7dd13c5a407e853c9eea29fcc5b3c/nvidia_cudnn_frontend-1.22.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:933275df405053001888875ee75d2138b20dc4e8bf4057461b1c74ca68b0e270", size = 2863367, upload-time = "2026-04-10T17:29:22.838Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/42/af975c8937a4c331b1215a0b2bdd2a742d792c6f777f919fd70480d63762/nvidia_cudnn_frontend-1.22.1-cp312-cp312-win_amd64.whl", hash = "sha256:2da1c277f008ee64273a48a5cb8d07efbb6d6774fdc08bd889476cce93b2f69a", size = 2310595, upload-time = "2026-04-10T17:37:24.776Z" },
-    { url = "https://files.pythonhosted.org/packages/29/d3/d698b020ced27b75f1e29862f0bc26759da96fc743570a094632c0dd14a9/nvidia_cudnn_frontend-1.22.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1bc0a0ec8004998a56f222cef618243bbee779930cdf3fe1f4a7604b2b412388", size = 2722225, upload-time = "2026-04-10T17:34:42.315Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/04/b7b66e3a0a7b036aca0f9704b335e663609359d0e3bdd7097f6d5ccdb40a/nvidia_cudnn_frontend-1.22.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b5295f8018cd92119968d948d25b0d2d834afd552627b47450759880dfe32110", size = 2863434, upload-time = "2026-04-10T17:29:55.721Z" },
-    { url = "https://files.pythonhosted.org/packages/54/8c/e9da7bbdf197397d13bb418027951e6181d0bb74c70c648fd97376bc2ed7/nvidia_cudnn_frontend-1.22.1-cp313-cp313-win_amd64.whl", hash = "sha256:7ea7887facf23d5363159073b0080cc09185e73be16ae797831d89f09b96b0f4", size = 2310490, upload-time = "2026-04-10T17:37:47.625Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/5b/951432f82d0226cba869c600dbbf892af9eb5e867b9d40839d0e6c6c3a9c/nvidia_cudnn_frontend-1.22.1-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aecf48a08520002a92d8be8a7191cf8c674a87373823678f54a25305bb35e841", size = 2723269, upload-time = "2026-04-10T17:35:31.507Z" },
-    { url = "https://files.pythonhosted.org/packages/3d/ef/dea590a9e1b7bed616274a14ec688a3555266f8b01c73d9f6ad47ca136de/nvidia_cudnn_frontend-1.22.1-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fb83a3c0419e8258abebf4dbc44a68ad02bc1d63c932479b9644525beecea6b0", size = 2864429, upload-time = "2026-04-10T17:30:37.55Z" },
-    { url = "https://files.pythonhosted.org/packages/36/c7/74e38e48e11b1fd18e934edaa2e45bffc9af349d819f56283c24f576ed26/nvidia_cudnn_frontend-1.22.1-cp314-cp314-win_amd64.whl", hash = "sha256:7a3c3e60b7be3777323426bf7334755ea99c87ffcf4c92bc7ba36c3248393f39", size = 2311675, upload-time = "2026-04-10T17:38:09.635Z" },
+    { url = "https://files.pythonhosted.org/packages/40/7d/28ab9cb9119fc6a3a383d943448ab310fe787daf784869b167dc7269969f/nvidia_cudnn_frontend-1.22.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:dbd3100ae212dd1f4691f8c096fe3aded46491f9a6cb258bfb802d07ca1a88fc", size = 2670597, upload-time = "2026-04-03T02:27:56.886Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/b4/976996f1ab721bbcae4b7379652949ddcd41803817d4b65b9bd0d726aa60/nvidia_cudnn_frontend-1.22.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62bf9c8569caf4d9518dae0755507ad36a4e311726aa015fde104c38a1630f76", size = 2811815, upload-time = "2026-04-03T02:32:24.504Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/56/755412cf4ce5ad95bcb00be3144c8e1fa07cbbae073f31a7b75ddec96ca0/nvidia_cudnn_frontend-1.22.0-cp310-cp310-win_amd64.whl", hash = "sha256:22748b41049d02c029719467924ea20d928517dd8f35e204a390f97407298eb2", size = 2260435, upload-time = "2026-04-03T02:24:51.808Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/ff/e4955b6fdff929ddf04a1252facae6201b308e001c91c690e96f65c4e90a/nvidia_cudnn_frontend-1.22.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cdff54c945fbabf9da06fd64ded60cf1ec94d580474f5746786c0effd759fedc", size = 2672347, upload-time = "2026-04-03T02:28:51.106Z" },
+    { url = "https://files.pythonhosted.org/packages/52/27/62fc6e2cddff7d6396be3685342ceec1c12fe2ee50e6f31d270887ecb5ad/nvidia_cudnn_frontend-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bb50bd2758c6d47c6210451c5c1932ed16e7563d7629228f4cc97edc0e01d0c5", size = 2814387, upload-time = "2026-04-03T02:32:47.972Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/4f/de06583ec21313f31d8b83bc2164e88fc22f5b48d8eb5cb45490fcf7c262/nvidia_cudnn_frontend-1.22.0-cp311-cp311-win_amd64.whl", hash = "sha256:49f817377a19e10e4aafa5797cd68315739dfdb2fc6a67dd1052b64c805d24ec", size = 2261332, upload-time = "2026-04-03T02:25:17.241Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/f1/67681e585abd98f968298c771b72830ce984a90fd0d787098d2ea2ba55c7/nvidia_cudnn_frontend-1.22.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bc9c12891d5427ef49b72b26df2b7889d623086d77c9e33b021c2de417d3e4dc", size = 2673215, upload-time = "2026-04-03T02:29:41.421Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/46/95b7779a2f71dfccce1783cc5ac210dda0124b93f8bf66cf62ed3d9ce0a5/nvidia_cudnn_frontend-1.22.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:98ffa05699d71795372f112fa2361c13be716fa3fda911c1e809903163ea5d11", size = 2815106, upload-time = "2026-04-03T02:33:11.473Z" },
+    { url = "https://files.pythonhosted.org/packages/61/47/522e84a37eedb1f680e74df449d39fe6f8641779523313d1a8522d449766/nvidia_cudnn_frontend-1.22.0-cp312-cp312-win_amd64.whl", hash = "sha256:81fde93d9b86ad631e17da1e2c103c4a7a541ec7abcb7f9a121cbd018c8eff26", size = 2262120, upload-time = "2026-04-03T02:25:40.18Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/93/43541b581207024824cb740f429bf882aaf3bde3633bd4099393dd9c0c16/nvidia_cudnn_frontend-1.22.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c9bdf48cf989b2a77f8b52623fc31c078362fd34389207d11cdb0b5624a7b311", size = 2673259, upload-time = "2026-04-03T02:30:30.634Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/5b/af9da5a455064380e68a441b9cfa1f1212dd6363bd02b5aa696d319bd211/nvidia_cudnn_frontend-1.22.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d02c4b4aae3e243ddb08ad4eb939988bcf7b1aefe25f5d400f6858c7276a6631", size = 2815032, upload-time = "2026-04-03T02:33:34.171Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/1d/3a15b719817ca6241e5f3a7a38608af21a3259e550a5dee5520e29adac00/nvidia_cudnn_frontend-1.22.0-cp313-cp313-win_amd64.whl", hash = "sha256:4906a38954725e35bc8431874f4d9db60d50e0d9dbc40ecaf8e5f40df545350b", size = 2262156, upload-time = "2026-04-03T02:26:03.322Z" },
+    { url = "https://files.pythonhosted.org/packages/27/ec/8c9b53a9174cca2d0062cbd8cb7c31403a38cb4c79984a9c554830cac5e9/nvidia_cudnn_frontend-1.22.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0f650058bda46a6542dfc3d021803021e7932e1cd6bb78cf46e81fa219717b5e", size = 2674887, upload-time = "2026-04-03T02:31:21.166Z" },
+    { url = "https://files.pythonhosted.org/packages/89/bd/3464d181ec2d94085cab98fd5ea4d312478aa6cb16ff38994a9188ac9f05/nvidia_cudnn_frontend-1.22.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:90f30b0d6563d050ca1972efa594a31d5affe5c3eeb467542e715d7ee73e3b5b", size = 2815841, upload-time = "2026-04-03T02:33:56.66Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/fd/bdec32a32b44f52b60a03f43e8619552ea0eb90a61de06632a054bf17d6a/nvidia_cudnn_frontend-1.22.0-cp314-cp314-win_amd64.whl", hash = "sha256:5994400a7f76a1be5e327a9ac1a4a635ee734d2ac8a5875e52481c52cf2b0922", size = 2263464, upload-time = "2026-04-03T02:26:26.553Z" },
 ]
 
 [[package]]
@@ -3538,7 +4399,7 @@ name = "nvidia-cufft"
 version = "12.0.0.61"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink" },
+    { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8b/ae/f417a75c0259e85c1d2f83ca4e960289a5f814ed0cea74d18c353d3e989d/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2708c852ef8cd89d1d2068bdbece0aa188813a0c934db3779b9b1faa8442e5f5", size = 214053554, upload-time = "2025-09-04T08:31:38.196Z" },
@@ -3570,9 +4431,9 @@ name = "nvidia-cusolver"
 version = "12.0.4.66"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas" },
-    { name = "nvidia-cusparse" },
-    { name = "nvidia-nvjitlink" },
+    { name = "nvidia-cublas", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
+    { name = "nvidia-cusparse", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
+    { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/c8/c3/b30c9e935fc01e3da443ec0116ed1b2a009bb867f5324d3f2d7e533e776b/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:02c2457eaa9e39de20f880f4bd8820e6a1cfb9f9a34f820eb12a155aa5bc92d2", size = 223467760, upload-time = "2025-09-04T08:33:04.222Z" },
@@ -3585,7 +4446,7 @@ name = "nvidia-cusparse"
 version = "12.6.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink" },
+    { name = "nvidia-nvjitlink", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f8/94/5c26f33738ae35276672f12615a64bd008ed5be6d1ebcb23579285d960a9/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:80bcc4662f23f1054ee334a15c72b8940402975e0eab63178fc7e670aa59472c", size = 162155568, upload-time = "2025-09-04T08:33:42.864Z" },
@@ -3620,11 +4481,15 @@ version = "4.4.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cuda-python" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "typing-extensions" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/5f/07/af1b456b5b6dd4a49e71a952a182a99fc863f70b9f78725324f89e0384e5/nvidia_cutlass_dsl_libs_base-4.4.2-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:06acb3acff3dcf4bf6630476efac7de94de30b988ded4fa00b647bbcec4224ff", size = 75471025, upload-time = "2026-03-16T02:23:49.61Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/12/f0770811d2874af7e04623d3baa83c445c49f38c00c4e5d20e1daae54b5d/nvidia_cutlass_dsl_libs_base-4.4.2-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:916bf612fba5fbc5162e300fe18196e960dac2328c1c1360c0939d3be05c7c71", size = 74355272, upload-time = "2026-03-16T02:24:44.22Z" },
+    { url = "https://files.pythonhosted.org/packages/60/bf/b9d0fd1ba281b111c941d9616dd9f98a509d84bf35076e60fef27ec7abd6/nvidia_cutlass_dsl_libs_base-4.4.2-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:261832dafe7579dc83cd3816ab9ea845e3de3737d876c215f01fb4edff1f4473", size = 75476977, upload-time = "2026-03-16T02:26:40.932Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/23/86dda6d69a3fc29d0cde2a8b54c056ad69b73a6e5e230e18d906d2ec3b7c/nvidia_cutlass_dsl_libs_base-4.4.2-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:40c2352b2fcc80789a216cbeb9b2ee10c85c15de839cda8f5c1d18166b8249df", size = 74356100, upload-time = "2026-03-16T02:26:12.778Z" },
     { url = "https://files.pythonhosted.org/packages/8e/7d/0df5e38d11e52cc72095a14d6448bc1c5d0d4b00b069a1189ca417fb225b/nvidia_cutlass_dsl_libs_base-4.4.2-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:2ec8812eeadcbb6fe20bda2e295ed9c00653f8253b78e33cf0ab65a47b829e73", size = 75473821, upload-time = "2026-03-16T02:27:08.371Z" },
     { url = "https://files.pythonhosted.org/packages/56/98/e264964741d9cc9816625d9600d17a5249fd5cbd8c2d166fb0d0c34dfe5a/nvidia_cutlass_dsl_libs_base-4.4.2-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:22e37b58f7a6f2f43bba533c4df8a088012122e0b4e9a632eca23937adeafb39", size = 74355593, upload-time = "2026-03-16T02:25:11.762Z" },
     { url = "https://files.pythonhosted.org/packages/1b/c9/2f17950ee2deb4b5f6b82f8155515a21792fe296e81bb638f164d8e2ca9b/nvidia_cutlass_dsl_libs_base-4.4.2-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:b59a052cbfb9a25747d1b6d413615456bea38d1f377da085af07c0d86a4c8b39", size = 75477304, upload-time = "2026-03-16T02:27:35.645Z" },
@@ -3658,13 +4523,14 @@ dependencies = [
     { name = "ninja" },
     { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" } },
     { name = "nvidia-ml-py" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" } },
+    { name = "packaging" },
     { name = "pulp" },
     { name = "pydantic" },
     { name = "regex" },
     { name = "rich" },
     { name = "safetensors" },
-    { name = "scipy" },
+    { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "scipy", version = "1.17.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "torch", marker = "sys_platform == 'never'" },
     { name = "tqdm" },
 ]
@@ -3712,17 +4578,17 @@ wheels = [
 
 [[package]]
 name = "nvidia-resiliency-ext"
-version = "0.6.0.dev69+b2bb3d7"
-source = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=b2bb3d728a18795807d9f76c535e005a609a1b01#b2bb3d728a18795807d9f76c535e005a609a1b01" }
+version = "0.6.0.dev33+15a8515"
+source = { git = "https://github.com/NVIDIA/nvidia-resiliency-ext.git?rev=15a851565a4ce846c04431ecb0cf09903ab4837e#15a851565a4ce846c04431ecb0cf09903ab4837e" }
 dependencies = [
     { name = "defusedxml" },
     { name = "grpcio" },
     { name = "grpcio-tools" },
-    { name = "langchain-openai" },
+    { name = "langchain-nvidia-ai-endpoints" },
     { name = "logsage" },
     { name = "mcp" },
     { name = "nvidia-ml-py" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" } },
+    { name = "packaging" },
     { name = "protobuf" },
     { name = "psutil" },
     { name = "pyyaml" },
@@ -3736,7 +4602,9 @@ version = "0.0.9.post1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "pydata-sphinx-theme" },
-    { name = "sphinx" },
+    { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8c/79/017fab2f7167a9a9795665f894d04f77aafceca80821b51589bb4b23ff5c/nvidia_sphinx_theme-0.0.9.post1-py3-none-any.whl", hash = "sha256:21ca60206dff2f380d7783d64bbaf71a5b9cacae53c7d0686f089c16b5a3d45a", size = 143816, upload-time = "2025-11-09T23:16:55.719Z" },
@@ -3748,6 +4616,12 @@ version = "0.2.15"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/92/dd/692765e87de30bae1522cdffaa0f2b52949658a92a0fa6d96b1a01eae9d2/nvtx-0.2.15.tar.gz", hash = "sha256:2287d3be05b85661deb386f878d1f536c2e532774aa9ec7a50c434942ed81ae5", size = 121230, upload-time = "2026-03-18T10:01:25.547Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/9d/ef/ea1e9d92afd07fdf2a2390e508f1d214e5ba890561d7849d6ca708534b9d/nvtx-0.2.15-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a4f50832fd90a1b480a9deef6e4cd48015b61869095b54dd1a7afe87b4138c6a", size = 768543, upload-time = "2026-03-18T10:07:21.819Z" },
+    { url = "https://files.pythonhosted.org/packages/32/8e/b42c05cf3cc43c51f21fdda6f7c4fe28a595c6d2bdb0cfbf0477dc5805f2/nvtx-0.2.15-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5f3362f0db4252514719326c9d5662b0f93d254659ba97b9c8dbe556286e0e3e", size = 771975, upload-time = "2026-03-18T10:12:23.772Z" },
+    { url = "https://files.pythonhosted.org/packages/60/77/fc000055b5bb1651cdd772f0fe1fd9a16c7773b28dfc5624eea331d1415d/nvtx-0.2.15-cp310-cp310-win_amd64.whl", hash = "sha256:d71f934e580d4572f382712b6da464ab69e4c212981506f781f927d5c6d935d6", size = 134503, upload-time = "2026-03-18T10:04:05.773Z" },
+    { url = "https://files.pythonhosted.org/packages/80/65/435d10b2041ee082c07d5aed129afd504012c8908796d695f10e66bcc716/nvtx-0.2.15-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:157b80ea9b4db6c8f47f8dbe2fa2e81e7a7f1445bb87f8268f43dec9210b78a1", size = 806443, upload-time = "2026-03-18T10:05:49.308Z" },
+    { url = "https://files.pythonhosted.org/packages/47/bc/be94576ba33af75bcc68a857daade64cb86481764d4fb0f36308b1f6fc85/nvtx-0.2.15-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:02bca69ee55e0be41eabf908de9dbcdd18e702c7f49f9aa63fd396ce684ff5d5", size = 808183, upload-time = "2026-03-18T10:11:16.262Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/7a/42109f1cfb1ff9913201cb2b804956a4f003db4c018c2522a3c8066b3a1c/nvtx-0.2.15-cp311-cp311-win_amd64.whl", hash = "sha256:dbe41f78f5a811bd4cdad0a237e5b41a4937d8c2c6c9abdd161091671a598bc0", size = 134631, upload-time = "2026-03-18T10:02:11.247Z" },
     { url = "https://files.pythonhosted.org/packages/c2/07/698355285a03a366ef63ea9762fc1feef3f9f25483e1655408f72d827090/nvtx-0.2.15-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2cc530cd0f1a2c14a3a7e683833db509888ac5ed4ead94e5c9e2c7317c6937a7", size = 807159, upload-time = "2026-03-18T10:09:49.232Z" },
     { url = "https://files.pythonhosted.org/packages/c0/d1/08f22448d83481408d663065764ba583df091a7de629ed38fc97e522f1af/nvtx-0.2.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3ca8030a6d197952318013dd1c12c22da1d4b9feb76ba72e0fcd449961183c2c", size = 806187, upload-time = "2026-03-18T10:13:32.972Z" },
     { url = "https://files.pythonhosted.org/packages/54/23/c97c39e3b7ba256aa343cb828ca0d1c8421f705ca84795658ecd14ca95ed/nvtx-0.2.15-cp312-cp312-win_amd64.whl", hash = "sha256:70a1e768964e0520b68ccabc4df391cc227537c45936a7eba6507bc65e617e00", size = 129178, upload-time = "2026-03-18T10:02:55.299Z" },
@@ -3804,6 +4678,17 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5b/bf/b0a63ee9f3759dcd177b28c6f2cb22f2aecc6d9b3efecaabc298883caa5f/onnx-1.19.0.tar.gz", hash = "sha256:aa3f70b60f54a29015e41639298ace06adf1dd6b023b9b30f1bca91bb0db9473", size = 11949859, upload-time = "2025-08-27T02:34:27.107Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/00/b3/8a6f3b05d18dffdc7c18839bd829587c826c8513f4bdbe21ddf37dacce50/onnx-1.19.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:e927d745939d590f164e43c5aec7338c5a75855a15130ee795f492fc3a0fa565", size = 18310869, upload-time = "2025-08-27T02:32:47.346Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/92/550d6155ab3f2c00e95add1726397c95b4b79d6eb4928d049ff591ad4c84/onnx-1.19.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:c6cdcb237c5c4202463bac50417c5a7f7092997a8469e8b7ffcd09f51de0f4a9", size = 18028144, upload-time = "2025-08-27T02:32:50.306Z" },
+    { url = "https://files.pythonhosted.org/packages/79/21/9bcc715ea6d9aab3f6c583bfc59504a14777e39e0591030e7345f4e40315/onnx-1.19.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ed0b85a33deacb65baffe6ca4ce91adf2bb906fa2dee3856c3c94e163d2eb563", size = 18200923, upload-time = "2025-08-27T02:32:54.325Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/90/3a6f0741ff22270e2f4b741f440ab68ba5525ebc94775cd6f2c01f531374/onnx-1.19.0-cp310-cp310-win32.whl", hash = "sha256:89a9cefe75547aec14a796352c2243e36793bbbcb642d8897118595ab0c2395b", size = 16332097, upload-time = "2025-08-27T02:32:56.997Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/4c/ef61d359865712803d488672607023d36bfcd21fa008d8dc1d6ee8e8b23c/onnx-1.19.0-cp310-cp310-win_amd64.whl", hash = "sha256:a16a82bfdf4738691c0a6eda5293928645ab8b180ab033df84080817660b5e66", size = 16451402, upload-time = "2025-08-27T02:33:00.534Z" },
+    { url = "https://files.pythonhosted.org/packages/db/5c/b959b17608cfb6ccf6359b39fe56a5b0b7d965b3d6e6a3c0add90812c36e/onnx-1.19.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:206f00c47b85b5c7af79671e3307147407991a17994c26974565aadc9e96e4e4", size = 18312580, upload-time = "2025-08-27T02:33:03.081Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/ee/ac052bbbc832abe0debb784c2c57f9582444fb5f51d63c2967fd04432444/onnx-1.19.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:4d7bee94abaac28988b50da675ae99ef8dd3ce16210d591fbd0b214a5930beb3", size = 18029165, upload-time = "2025-08-27T02:33:05.771Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/c9/8687ba0948d46fd61b04e3952af9237883bbf8f16d716e7ed27e688d73b8/onnx-1.19.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7730b96b68c0c354bbc7857961bb4909b9aaa171360a8e3708d0a4c749aaadeb", size = 18202125, upload-time = "2025-08-27T02:33:09.325Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/16/6249c013e81bd689f46f96c7236d7677f1af5dd9ef22746716b48f10e506/onnx-1.19.0-cp311-cp311-win32.whl", hash = "sha256:7cb7a3ad8059d1a0dfdc5e0a98f71837d82002e441f112825403b137227c2c97", size = 16332738, upload-time = "2025-08-27T02:33:12.448Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/28/34a1e2166e418c6a78e5c82e66f409d9da9317832f11c647f7d4e23846a6/onnx-1.19.0-cp311-cp311-win_amd64.whl", hash = "sha256:d75452a9be868bd30c3ef6aa5991df89bbfe53d0d90b2325c5e730fbd91fff85", size = 16452303, upload-time = "2025-08-27T02:33:15.176Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/b7/639664626e5ba8027860c4d2a639ee02b37e9c322215c921e9222513c3aa/onnx-1.19.0-cp311-cp311-win_arm64.whl", hash = "sha256:23c7959370d7b3236f821e609b0af7763cff7672a758e6c1fc877bac099e786b", size = 16425340, upload-time = "2025-08-27T02:33:17.78Z" },
     { url = "https://files.pythonhosted.org/packages/0d/94/f56f6ca5e2f921b28c0f0476705eab56486b279f04e1d568ed64c14e7764/onnx-1.19.0-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:61d94e6498ca636756f8f4ee2135708434601b2892b7c09536befb19bc8ca007", size = 18322331, upload-time = "2025-08-27T02:33:20.373Z" },
     { url = "https://files.pythonhosted.org/packages/c8/00/8cc3f3c40b54b28f96923380f57c9176872e475face726f7d7a78bd74098/onnx-1.19.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:224473354462f005bae985c72028aaa5c85ab11de1b71d55b06fdadd64a667dd", size = 18027513, upload-time = "2025-08-27T02:33:23.44Z" },
     { url = "https://files.pythonhosted.org/packages/61/90/17c4d2566fd0117a5e412688c9525f8950d467f477fbd574e6b32bc9cb8d/onnx-1.19.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae475c85c89bc4d1f16571006fd21a3e7c0e258dd2c091f6e8aafb083d1ed9b", size = 18202278, upload-time = "2025-08-27T02:33:26.103Z" },
@@ -3836,26 +4721,45 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
 ]
 dependencies = [
     { name = "ml-dtypes", version = "0.5.4", source = { registry = "https://pypi.org/simple" } },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "protobuf" },
     { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/c5/93/942d2a0f6a70538eea042ce0445c8aefd46559ad153469986f29a743c01c/onnx-1.21.0.tar.gz", hash = "sha256:4d8b67d0aaec5864c87633188b91cc520877477ec0254eda122bef8be43cd764", size = 12074608, upload-time = "2026-03-27T21:33:36.118Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/a8/28/a14b1845bf9302c3a787221e8f37cde4e7f930e10d95a8e22dd910aeb41d/onnx-1.21.0-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:e0c21cc5c7a41d1a509828e2b14fe9c30e807c6df611ec0fd64a47b8d4b16abd", size = 17966899, upload-time = "2026-03-27T21:32:15.53Z" },
+    { url = "https://files.pythonhosted.org/packages/41/7b/788881bf022a4cfb7b0843782f88415ea51c805cee4a909dcf2e49bb8129/onnx-1.21.0-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e1931bfcc222a4c9da6475f2ffffb84b97ab3876041ec639171c11ce802bee6a", size = 17534297, upload-time = "2026-03-27T21:32:18.343Z" },
+    { url = "https://files.pythonhosted.org/packages/16/51/eb64d4f2ec6caa98909aab5fbcfa24be9c059081e804bbb0012cc549ef89/onnx-1.21.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9b56ad04039fac6b028c07e54afa1ec7f75dd340f65311f2c292e41ed7aa4d9", size = 17616697, upload-time = "2026-03-27T21:32:21Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/4e/6b1f7800dae3407dc850e7e59d591ed8c83e9b3401e4cd57a1f612e400c6/onnx-1.21.0-cp310-cp310-win32.whl", hash = "sha256:3abd09872523c7e0362d767e4e63bd7c6bac52a5e2c3edbf061061fe540e2027", size = 16288893, upload-time = "2026-03-27T21:32:23.864Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/a8/89273e581d3943e20314af19b1596ab4d763f9c2eb07d4eaf4fb0593219b/onnx-1.21.0-cp310-cp310-win_amd64.whl", hash = "sha256:f2c7c234c568402e10db74e33d787e4144e394ae2bcbbf11000fbfe2e017ad68", size = 16443416, upload-time = "2026-03-27T21:32:26.655Z" },
+    { url = "https://files.pythonhosted.org/packages/45/48/32e383aa6bc40b72a9fd419937aaa647078190c9bfccdc97b316d2dee687/onnx-1.21.0-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:2aca19949260875c14866fc77ea0bc37e4e809b24976108762843d328c92d3ce", size = 17968053, upload-time = "2026-03-27T21:32:29.558Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/26/5726e8df7d36e96bb3c679912d1a86af42f393d77aa17d6b98a97d4289ce/onnx-1.21.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:82aa6ab51144df07c58c4850cb78d4f1ae969d8c0bf657b28041796d49ba6974", size = 17534821, upload-time = "2026-03-27T21:32:32.351Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/2b/021dcd2dd50c3c71b7959d7368526da384a295c162fb4863f36057973f78/onnx-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c3185a232089335581fabb98fba4e86d3e8246b8140f2e406082438100ebda", size = 17616664, upload-time = "2026-03-27T21:32:34.921Z" },
+    { url = "https://files.pythonhosted.org/packages/12/00/afa32a46fa122a7ed42df1cfe8796922156a3725ba8fc581c4779c96e2fc/onnx-1.21.0-cp311-cp311-win32.whl", hash = "sha256:f53b3c15a3b539c16b99655c43c365622046d68c49b680c48eba4da2a4fb6f27", size = 16289035, upload-time = "2026-03-27T21:32:37.783Z" },
+    { url = "https://files.pythonhosted.org/packages/73/8d/483cc980a24d4c0131d0af06d0ff6a37fb08ae90a7848ece8cef645194f1/onnx-1.21.0-cp311-cp311-win_amd64.whl", hash = "sha256:5f78c411743db317a76e5d009f84f7e3d5380411a1567a868e82461a1e5c775d", size = 16443748, upload-time = "2026-03-27T21:32:40.337Z" },
+    { url = "https://files.pythonhosted.org/packages/38/78/9d06fd5aaaed1ec9cb8a3b70fbbf00c1bdc18db610771e96379f0ed58112/onnx-1.21.0-cp311-cp311-win_arm64.whl", hash = "sha256:ab6a488dabbb172eebc9f3b3e7ac68763f32b0c571626d4a5004608f866cc83d", size = 16406123, upload-time = "2026-03-27T21:32:45.159Z" },
     { url = "https://files.pythonhosted.org/packages/7d/ae/cb644ec84c25e63575d9d8790fdcc5d1a11d67d3f62f872edb35fa38d158/onnx-1.21.0-cp312-abi3-macosx_12_0_universal2.whl", hash = "sha256:fc2635400fe39ff37ebc4e75342cc54450eadadf39c540ff132c319bf4960095", size = 17965930, upload-time = "2026-03-27T21:32:48.089Z" },
     { url = "https://files.pythonhosted.org/packages/6f/b6/eeb5903586645ef8a49b4b7892580438741acc3df91d7a5bd0f3a59ea9cb/onnx-1.21.0-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9003d5206c01fa2ff4b46311566865d8e493e1a6998d4009ec6de39843f1b59b", size = 17531344, upload-time = "2026-03-27T21:32:50.837Z" },
     { url = "https://files.pythonhosted.org/packages/a7/00/4823f06357892d1e60d6f34e7299d2ba4ed2108c487cc394f7ce85a3ff14/onnx-1.21.0-cp312-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a9261bd580fb8548c9c37b3c6750387eb8f21ea43c63880d37b2c622e1684285", size = 17613697, upload-time = "2026-03-27T21:32:54.222Z" },
@@ -3916,21 +4820,29 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
 ]
 dependencies = [
     { name = "ml-dtypes", version = "0.5.4", source = { registry = "https://pypi.org/simple" } },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "onnx", version = "1.21.0", source = { registry = "https://pypi.org/simple" } },
     { name = "sympy" },
     { name = "typing-extensions" },
@@ -3963,7 +4875,7 @@ dependencies = [
     { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
     { name = "onnx", version = "1.19.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
     { name = "onnx-ir", version = "0.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
+    { name = "packaging", marker = "python_full_version >= '3.13'" },
     { name = "typing-extensions", marker = "python_full_version >= '3.13'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/f5/2f/0bb2b6ca727e4d5173f640527f402ab4225def4bc8d667269b83047be8c4/onnxscript-0.5.0.tar.gz", hash = "sha256:4aba215e1f80fbcd07ba0d97d6bca96797fc3e9639eacb5434d35317ce1406aa", size = 588762, upload-time = "2025-09-12T16:57:46.484Z" }
@@ -3984,25 +4896,32 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
 ]
 dependencies = [
     { name = "ml-dtypes", version = "0.5.4", source = { registry = "https://pypi.org/simple" } },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "onnx", version = "1.21.0", source = { registry = "https://pypi.org/simple" } },
     { name = "onnx-ir", version = "0.2.0", source = { registry = "https://pypi.org/simple" } },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "packaging" },
     { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e7/2b/538fdeb0e25bed5d7e0f954af5710543e2629499fb74381afc3333f8a8ae/onnxscript-0.6.2.tar.gz", hash = "sha256:abb2e6f464db40c9b8c7fbb3e64cca04cf3f4495e67c4eda5eac17b784191ce3", size = 590865, upload-time = "2026-02-10T22:53:39.638Z" }
@@ -4012,7 +4931,7 @@ wheels = [
 
 [[package]]
 name = "openai"
-version = "2.32.0"
+version = "2.30.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "anyio" },
@@ -4024,9 +4943,9 @@ dependencies = [
     { name = "tqdm" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ed/59/bdcc6b759b8c42dd73afaf5bf8f902c04b37987a5514dbc1c64dba390fef/openai-2.32.0.tar.gz", hash = "sha256:c54b27a9e4cb8d51f0dd94972ffd1a04437efeb259a9e60d8922b8bd26fe55e0", size = 693286, upload-time = "2026-04-15T22:28:19.434Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/88/15/52580c8fbc16d0675d516e8749806eda679b16de1e4434ea06fb6feaa610/openai-2.30.0.tar.gz", hash = "sha256:92f7661c990bda4b22a941806c83eabe4896c3094465030dd882a71abe80c885", size = 676084, upload-time = "2026-03-25T22:08:59.96Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/1e/c1/d6e64ccd0536bf616556f0cad2b6d94a8125f508d25cfd814b1d2db4e2f1/openai-2.32.0-py3-none-any.whl", hash = "sha256:4dcc9badeb4bf54ad0d187453742f290226d30150890b7890711bda4f32f192f", size = 1162570, upload-time = "2026-04-15T22:28:17.714Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/9e/5bfa2270f902d5b92ab7d41ce0475b8630572e71e349b2a4996d14bdda93/openai-2.30.0-py3-none-any.whl", hash = "sha256:9a5ae616888eb2748ec5e0c5b955a51592e0b201a11f4262db920f2a78c5231d", size = 1146656, upload-time = "2026-03-25T22:08:58.2Z" },
 ]
 
 [package.optional-dependencies]
@@ -4071,16 +4990,24 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
 ]
 dependencies = [
     { name = "deprecated", marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
@@ -4100,11 +5027,15 @@ resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'emscripten'",
     "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11'",
 ]
 dependencies = [
     { name = "importlib-metadata", marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
@@ -4128,16 +5059,24 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
 ]
 dependencies = [
     { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
@@ -4158,11 +5097,15 @@ resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'emscripten'",
     "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11'",
 ]
 dependencies = [
     { name = "opentelemetry-api", version = "1.41.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
@@ -4176,14 +5119,14 @@ wheels = [
 
 [[package]]
 name = "opentelemetry-proto"
-version = "1.41.0"
+version = "1.40.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "protobuf" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/e0/d9/08e3dc6156878713e8c811682bc76151f5fe1a3cb7f3abda3966fd56e71e/opentelemetry_proto-1.41.0.tar.gz", hash = "sha256:95d2e576f9fb1800473a3e4cfcca054295d06bdb869fda4dc9f4f779dc68f7b6", size = 45669, upload-time = "2026-04-09T14:38:45.978Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/4c/77/dd38991db037fdfce45849491cb61de5ab000f49824a00230afb112a4392/opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd", size = 45667, upload-time = "2026-03-04T14:17:31.194Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/49/8c/65ef7a9383a363864772022e822b5d5c6988e6f9dabeebb9278f5b86ebc3/opentelemetry_proto-1.41.0-py3-none-any.whl", hash = "sha256:b970ab537309f9eed296be482c3e7cca05d8aca8165346e929f658dbe153b247", size = 72074, upload-time = "2026-04-09T14:38:29.38Z" },
+    { url = "https://files.pythonhosted.org/packages/b9/b2/189b2577dde745b15625b3214302605b1353436219d42b7912e77fa8dc24/opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f", size = 72073, upload-time = "2026-03-04T14:17:16.673Z" },
 ]
 
 [[package]]
@@ -4199,16 +5142,24 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
 ]
 dependencies = [
     { name = "opentelemetry-api", version = "1.33.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
@@ -4229,11 +5180,15 @@ resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'emscripten'",
     "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11'",
 ]
 dependencies = [
     { name = "opentelemetry-api", version = "1.41.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
@@ -4258,16 +5213,24 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
 ]
 dependencies = [
     { name = "deprecated", marker = "extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
@@ -4287,11 +5250,15 @@ resolution-markers = [
     "python_full_version >= '3.14' and sys_platform == 'emscripten'",
     "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version < '3.11'",
 ]
 dependencies = [
     { name = "opentelemetry-api", version = "1.41.0", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts')" },
@@ -4308,6 +5275,34 @@ version = "3.11.8"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/9d/1b/2024d06792d0779f9dbc51531b61c24f76c75b9f4ce05e6f3377a1814cea/orjson-3.11.8.tar.gz", hash = "sha256:96163d9cdc5a202703e9ad1b9ae757d5f0ca62f4fa0cc93d1f27b0e180cc404e", size = 5603832, upload-time = "2026-03-31T16:16:27.878Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/2f/90/5d81f61fe3e4270da80c71442864c091cee3003cc8984c75f413fe742a07/orjson-3.11.8-cp310-cp310-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:e6693ff90018600c72fd18d3d22fa438be26076cd3c823da5f63f7bab28c11cb", size = 229663, upload-time = "2026-03-31T16:14:30.708Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/ef/85e06b0eb11de6fb424120fd5788a07035bd4c5e6bb7841ae9972a0526d1/orjson-3.11.8-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93de06bc920854552493c81f1f729fab7213b7db4b8195355db5fda02c7d1363", size = 132321, upload-time = "2026-03-31T16:14:32.317Z" },
+    { url = "https://files.pythonhosted.org/packages/86/71/089338ee51b3132f050db0864a7df9bdd5e94c2a03820ab8a91e8f655618/orjson-3.11.8-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fe0b8c83e0f36247fc9431ce5425a5d95f9b3a689133d494831bdbd6f0bceb13", size = 130658, upload-time = "2026-03-31T16:14:33.935Z" },
+    { url = "https://files.pythonhosted.org/packages/10/0d/f39d8802345d0ad65f7fd4374b29b9b59f98656dc30f21ca5c773265b2f0/orjson-3.11.8-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:97d823831105c01f6c8029faf297633dbeb30271892bd430e9c24ceae3734744", size = 135708, upload-time = "2026-03-31T16:14:35.224Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/b5/40aae576b3473511696dcffea84fde638b2b64774eb4dcb8b2c262729f8a/orjson-3.11.8-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c60c0423f15abb6cf78f56dff00168a1b582f7a1c23f114036e2bfc697814d5f", size = 147047, upload-time = "2026-03-31T16:14:36.489Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/f0/778a84458d1fdaa634b2e572e51ce0b354232f580b2327e1f00a8d88c38c/orjson-3.11.8-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:01928d0476b216ad2201823b0a74000440360cef4fed1912d297b8d84718f277", size = 133072, upload-time = "2026-03-31T16:14:37.715Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/d3/1bbf2fc3ffcc4b829ade554b574af68cec898c9b5ad6420a923c75a073d3/orjson-3.11.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6a4a639049c44d36a6d1ae0f4a94b271605c745aee5647fa8ffaabcdc01b69a6", size = 133867, upload-time = "2026-03-31T16:14:39.356Z" },
+    { url = "https://files.pythonhosted.org/packages/08/94/6413da22edc99a69a8d0c2e83bf42973b8aa94d83ef52a6d39ac85da00bc/orjson-3.11.8-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3222adff1e1ff0dce93c16146b93063a7793de6c43d52309ae321234cdaf0f4d", size = 142268, upload-time = "2026-03-31T16:14:40.972Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/5f/aa5dbaa6136d7ba55f5461ac2e885efc6e6349424a428927fd46d68f4396/orjson-3.11.8-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:3223665349bbfb68da234acd9846955b1a0808cbe5520ff634bf253a4407009b", size = 424008, upload-time = "2026-03-31T16:14:42.637Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/aa/2c1962d108c7fe5e27aa03a354b378caf56d8eafdef15fd83dec081ce45a/orjson-3.11.8-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:61c9d357a59465736022d5d9ba06687afb7611dfb581a9d2129b77a6fcf78e59", size = 147942, upload-time = "2026-03-31T16:14:44.256Z" },
+    { url = "https://files.pythonhosted.org/packages/47/d1/65f404f4c47eb1b0b4476f03ec838cac0c4aa933920ff81e5dda4dee14e7/orjson-3.11.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:58fb9b17b4472c7b1dcf1a54583629e62e23779b2331052f09a9249edf81675b", size = 136640, upload-time = "2026-03-31T16:14:45.884Z" },
+    { url = "https://files.pythonhosted.org/packages/90/5f/7b784aea98bdb125a2f2da7c27d6c2d2f6d943d96ef0278bae596d563f85/orjson-3.11.8-cp310-cp310-win32.whl", hash = "sha256:b43dc2a391981d36c42fa57747a49dae793ef1d2e43898b197925b5534abd10a", size = 132066, upload-time = "2026-03-31T16:14:47.397Z" },
+    { url = "https://files.pythonhosted.org/packages/92/ec/2e284af8d6c9478df5ef938917743f61d68f4c70d17f1b6e82f7e3b8dba1/orjson-3.11.8-cp310-cp310-win_amd64.whl", hash = "sha256:c98121237fea2f679480765abd566f7713185897f35c9e6c2add7e3a9900eb61", size = 127609, upload-time = "2026-03-31T16:14:48.78Z" },
+    { url = "https://files.pythonhosted.org/packages/67/41/5aa7fa3b0f4dc6b47dcafc3cea909299c37e40e9972feabc8b6a74e2730d/orjson-3.11.8-cp311-cp311-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:003646067cc48b7fcab2ae0c562491c9b5d2cbd43f1e5f16d98fd118c5522d34", size = 229229, upload-time = "2026-03-31T16:14:50.424Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/d7/57e7f2458e0a2c41694f39fc830030a13053a84f837a5b73423dca1f0938/orjson-3.11.8-cp311-cp311-macosx_15_0_arm64.whl", hash = "sha256:ed193ce51d77a3830cad399a529cd4ef029968761f43ddc549e1bc62b40d88f8", size = 128871, upload-time = "2026-03-31T16:14:51.888Z" },
+    { url = "https://files.pythonhosted.org/packages/53/4a/e0fdb9430983e6c46e0299559275025075568aad5d21dd606faee3703924/orjson-3.11.8-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f30491bc4f862aa15744b9738517454f1e46e56c972a2be87d70d727d5b2a8f8", size = 132104, upload-time = "2026-03-31T16:14:53.142Z" },
+    { url = "https://files.pythonhosted.org/packages/08/4a/2025a60ff3f5c8522060cda46612d9b1efa653de66ed2908591d8d82f22d/orjson-3.11.8-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6eda5b8b6be91d3f26efb7dc6e5e68ee805bc5617f65a328587b35255f138bf4", size = 130483, upload-time = "2026-03-31T16:14:54.605Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/3c/b9cde05bdc7b2385c66014e0620627da638d3d04e4954416ab48c31196c5/orjson-3.11.8-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ee8db7bfb6fe03581bbab54d7c4124a6dd6a7f4273a38f7267197890f094675f", size = 135481, upload-time = "2026-03-31T16:14:55.901Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/f2/a8238e7734de7cb589fed319857a8025d509c89dc52fdcc88f39c6d03d5a/orjson-3.11.8-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d8b5231de76c528a46b57010bbd83fb51e056aa0220a372fd5065e978406f1c", size = 146819, upload-time = "2026-03-31T16:14:57.548Z" },
+    { url = "https://files.pythonhosted.org/packages/db/10/dbf1e2a3cafea673b1b4350e371877b759060d6018a998643b7040e5de48/orjson-3.11.8-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:58a4a208a6fbfdb7a7327b8f201c6014f189f721fd55d047cafc4157af1bc62a", size = 132846, upload-time = "2026-03-31T16:14:58.91Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/fc/55e667ec9c85694038fcff00573d221b085d50777368ee3d77f38668bf3c/orjson-3.11.8-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f8952d6d2505c003e8f0224ff7858d341fa4e33fef82b91c4ff0ef070f2393c", size = 133580, upload-time = "2026-03-31T16:15:00.519Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/a6/c08c589a9aad0cb46c4831d17de212a2b6901f9d976814321ff8e69e8785/orjson-3.11.8-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0022bb50f90da04b009ce32c512dc1885910daa7cb10b7b0cba4505b16db82a8", size = 142042, upload-time = "2026-03-31T16:15:01.906Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/cc/2f78ea241d52b717d2efc38878615fe80425bf2beb6e68c984dde257a766/orjson-3.11.8-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:ff51f9d657d1afb6f410cb435792ce4e1fe427aab23d2fcd727a2876e21d4cb6", size = 423845, upload-time = "2026-03-31T16:15:03.703Z" },
+    { url = "https://files.pythonhosted.org/packages/70/07/c17dcf05dd8045457538428a983bf1f1127928df5bf328cb24d2b7cddacb/orjson-3.11.8-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6dbe9a97bdb4d8d9d5367b52a7c32549bba70b2739c58ef74a6964a6d05ae054", size = 147729, upload-time = "2026-03-31T16:15:05.203Z" },
+    { url = "https://files.pythonhosted.org/packages/90/6c/0fb6e8a24e682e0958d71711ae6f39110e4b9cd8cab1357e2a89cb8e1951/orjson-3.11.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a5c370674ebabe16c6ccac33ff80c62bf8a6e59439f5e9d40c1f5ab8fd2215b7", size = 136425, upload-time = "2026-03-31T16:15:07.052Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/35/4d3cc3a3d616035beb51b24a09bb872942dc452cf2df0c1d11ab35046d9f/orjson-3.11.8-cp311-cp311-win32.whl", hash = "sha256:0e32f7154299f42ae66f13488963269e5eccb8d588a65bc839ed986919fc9fac", size = 131870, upload-time = "2026-03-31T16:15:08.678Z" },
+    { url = "https://files.pythonhosted.org/packages/13/26/9fe70f81d16b702f8c3a775e8731b50ad91d22dacd14c7599b60a0941cd1/orjson-3.11.8-cp311-cp311-win_amd64.whl", hash = "sha256:25e0c672a2e32348d2eb33057b41e754091f2835f87222e4675b796b92264f06", size = 127440, upload-time = "2026-03-31T16:15:09.994Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/c6/b038339f4145efd2859c1ca53097a52c0bb9cbdd24f947ebe146da1ad067/orjson-3.11.8-cp311-cp311-win_arm64.whl", hash = "sha256:9185589c1f2a944c17e26c9925dcdbc2df061cc4a145395c57f0c51f9b5dbfcd", size = 127399, upload-time = "2026-03-31T16:15:11.412Z" },
     { url = "https://files.pythonhosted.org/packages/01/f6/8d58b32ab32d9215973a1688aebd098252ee8af1766c0e4e36e7831f0295/orjson-3.11.8-cp312-cp312-macosx_10_15_x86_64.macosx_11_0_arm64.macosx_10_15_universal2.whl", hash = "sha256:1cd0b77e77c95758f8e1100139844e99f3ccc87e71e6fc8e1c027e55807c549f", size = 229233, upload-time = "2026-03-31T16:15:12.762Z" },
     { url = "https://files.pythonhosted.org/packages/a9/8b/2ffe35e71f6b92622e8ea4607bf33ecf7dfb51b3619dcfabfd36cbe2d0a5/orjson-3.11.8-cp312-cp312-macosx_15_0_arm64.whl", hash = "sha256:6a3d159d5ffa0e3961f353c4b036540996bf8b9697ccc38261c0eac1fd3347a6", size = 128772, upload-time = "2026-03-31T16:15:14.237Z" },
     { url = "https://files.pythonhosted.org/packages/27/d2/1f8682ae50d5c6897a563cb96bc106da8c9cb5b7b6e81a52e4cc086679b9/orjson-3.11.8-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76070a76e9c5ae661e2d9848f216980d8d533e0f8143e6ed462807b242e3c5e8", size = 131946, upload-time = "2026-03-31T16:15:15.607Z" },
@@ -4357,69 +5352,11 @@ wheels = [
 
 [[package]]
 name = "packaging"
-version = "25.0"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/a1/d4/1fc4078c65507b51b96ca8f8c3ba19e6a61c8253c72794544580a7b6c24d/packaging-25.0.tar.gz", hash = "sha256:d443872c98d677bf60f6a1f2f8c1cb748e8fe762d2bf9d3148b5599295b0fc4f", size = 165727, upload-time = "2025-04-19T11:48:59.673Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/20/12/38679034af332785aac8774540895e234f4d07f7545804097de4b666afd8/packaging-25.0-py3-none-any.whl", hash = "sha256:29572ef2b1f17581046b3a2227d5c611fb25ec70ca1ba8554b24b0e69331a484", size = 66469, upload-time = "2025-04-19T11:48:57.875Z" },
-]
-
-[[package]]
-name = "packaging"
-version = "26.1"
+version = "26.0"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-]
-sdist = { url = "https://files.pythonhosted.org/packages/df/de/0d2b39fb4af88a0258f3bac87dfcbb48e73fbdea4a2ed0e2213f9a4c2f9a/packaging-26.1.tar.gz", hash = "sha256:f042152b681c4bfac5cae2742a55e103d27ab2ec0f3d88037136b6bfe7c9c5de", size = 215519, upload-time = "2026-04-14T21:12:49.362Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/65/ee/299d360cdc32edc7d2cf530f3accf79c4fca01e96ffc950d8a52213bd8e4/packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4", size = 143416, upload-time = "2026-01-21T20:50:39.064Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7a/c2/920ef838e2f0028c8262f16101ec09ebd5969864e5a64c4c05fad0617c56/packaging-26.1-py3-none-any.whl", hash = "sha256:5d9c0669c6285e491e0ced2eee587eaf67b670d94a19e94e3984a481aba6802f", size = 95831, upload-time = "2026-04-14T21:12:47.56Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/b9/c538f279a4e237a006a2c98387d081e9eb060d203d8ed34467cc0f0b9b53/packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529", size = 74366, upload-time = "2026-01-21T20:50:37.788Z" },
 ]
 
 [[package]]
@@ -4427,33 +5364,58 @@ name = "pandas"
 version = "2.3.3"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-]
-dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "python-dateutil", marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "pytz", marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "tzdata", marker = "extra == 'extra-13-megatron-core-dev'" },
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+dependencies = [
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "python-dateutil", marker = "python_full_version < '3.11'" },
+    { name = "pytz", marker = "python_full_version < '3.11'" },
+    { name = "tzdata", marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/33/01/d40b85317f86cf08d853a4f495195c73815fdf205eef3993821720274518/pandas-2.3.3.tar.gz", hash = "sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b", size = 4495223, upload-time = "2025-09-29T23:34:51.853Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/3d/f7/f425a00df4fcc22b292c6895c6831c0c8ae1d9fac1e024d16f98a9ce8749/pandas-2.3.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c", size = 11555763, upload-time = "2025-09-29T23:16:53.287Z" },
+    { url = "https://files.pythonhosted.org/packages/13/4f/66d99628ff8ce7857aca52fed8f0066ce209f96be2fede6cef9f84e8d04f/pandas-2.3.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a", size = 10801217, upload-time = "2025-09-29T23:17:04.522Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/03/3fc4a529a7710f890a239cc496fc6d50ad4a0995657dccc1d64695adb9f4/pandas-2.3.3-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1", size = 12148791, upload-time = "2025-09-29T23:17:18.444Z" },
+    { url = "https://files.pythonhosted.org/packages/40/a8/4dac1f8f8235e5d25b9955d02ff6f29396191d4e665d71122c3722ca83c5/pandas-2.3.3-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838", size = 12769373, upload-time = "2025-09-29T23:17:35.846Z" },
+    { url = "https://files.pythonhosted.org/packages/df/91/82cc5169b6b25440a7fc0ef3a694582418d875c8e3ebf796a6d6470aa578/pandas-2.3.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250", size = 13200444, upload-time = "2025-09-29T23:17:49.341Z" },
+    { url = "https://files.pythonhosted.org/packages/10/ae/89b3283800ab58f7af2952704078555fa60c807fff764395bb57ea0b0dbd/pandas-2.3.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4", size = 13858459, upload-time = "2025-09-29T23:18:03.722Z" },
+    { url = "https://files.pythonhosted.org/packages/85/72/530900610650f54a35a19476eca5104f38555afccda1aa11a92ee14cb21d/pandas-2.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826", size = 11346086, upload-time = "2025-09-29T23:18:18.505Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/fa/7ac648108144a095b4fb6aa3de1954689f7af60a14cf25583f4960ecb878/pandas-2.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523", size = 11578790, upload-time = "2025-09-29T23:18:30.065Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/35/74442388c6cf008882d4d4bdfc4109be87e9b8b7ccd097ad1e7f006e2e95/pandas-2.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45", size = 10833831, upload-time = "2025-09-29T23:38:56.071Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/e4/de154cbfeee13383ad58d23017da99390b91d73f8c11856f2095e813201b/pandas-2.3.3-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66", size = 12199267, upload-time = "2025-09-29T23:18:41.627Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/c9/63f8d545568d9ab91476b1818b4741f521646cbdd151c6efebf40d6de6f7/pandas-2.3.3-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b", size = 12789281, upload-time = "2025-09-29T23:18:56.834Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/00/a5ac8c7a0e67fd1a6059e40aa08fa1c52cc00709077d2300e210c3ce0322/pandas-2.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791", size = 13240453, upload-time = "2025-09-29T23:19:09.247Z" },
+    { url = "https://files.pythonhosted.org/packages/27/4d/5c23a5bc7bd209231618dd9e606ce076272c9bc4f12023a70e03a86b4067/pandas-2.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151", size = 13890361, upload-time = "2025-09-29T23:19:25.342Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/59/712db1d7040520de7a4965df15b774348980e6df45c129b8c64d0dbe74ef/pandas-2.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c", size = 11348702, upload-time = "2025-09-29T23:19:38.296Z" },
     { url = "https://files.pythonhosted.org/packages/9c/fb/231d89e8637c808b997d172b18e9d4a4bc7bf31296196c260526055d1ea0/pandas-2.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53", size = 11597846, upload-time = "2025-09-29T23:19:48.856Z" },
     { url = "https://files.pythonhosted.org/packages/5c/bd/bf8064d9cfa214294356c2d6702b716d3cf3bb24be59287a6a21e24cae6b/pandas-2.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35", size = 10729618, upload-time = "2025-09-29T23:39:08.659Z" },
     { url = "https://files.pythonhosted.org/packages/57/56/cf2dbe1a3f5271370669475ead12ce77c61726ffd19a35546e31aa8edf4e/pandas-2.3.3-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908", size = 11737212, upload-time = "2025-09-29T23:19:59.765Z" },
@@ -4502,33 +5464,50 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
     "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
-    "python_full_version < '3.13' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
 ]
 dependencies = [
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
-    { name = "python-dateutil", marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
-    { name = "tzdata", marker = "(sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev') or (sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "python-dateutil", marker = "(python_full_version >= '3.11' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tzdata", marker = "(python_full_version >= '3.11' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev') or (python_full_version >= '3.11' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev') or (sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/da/99/b342345300f13440fe9fe385c3c481e2d9a595ee3bab4d3219247ac94e9a/pandas-3.0.2.tar.gz", hash = "sha256:f4753e73e34c8d83221ba58f232433fca2748be8b18dbca02d242ed153945043", size = 4645855, upload-time = "2026-03-31T06:48:30.816Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/97/35/6411db530c618e0e0005187e35aa02ce60ae4c4c4d206964a2f978217c27/pandas-3.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a727a73cbdba2f7458dc82449e2315899d5140b449015d822f515749a46cbbe0", size = 10326926, upload-time = "2026-03-31T06:46:08.29Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/d3/b7da1d5d7dbdc5ef52ed7debd2b484313b832982266905315dad5a0bf0b1/pandas-3.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dbbd4aa20ca51e63b53bbde6a0fa4254b1aaabb74d2f542df7a7959feb1d760c", size = 9926987, upload-time = "2026-03-31T06:46:11.724Z" },
+    { url = "https://files.pythonhosted.org/packages/52/77/9b1c2d6070b5dbe239a7bc889e21bfa58720793fb902d1e070695d87c6d0/pandas-3.0.2-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:339dda302bd8369dedeae979cb750e484d549b563c3f54f3922cb8ff4978c5eb", size = 10757067, upload-time = "2026-03-31T06:46:14.903Z" },
+    { url = "https://files.pythonhosted.org/packages/20/17/ec40d981705654853726e7ac9aea9ddbb4a5d9cf54d8472222f4f3de06c2/pandas-3.0.2-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:61c2fd96d72b983a9891b2598f286befd4ad262161a609c92dc1652544b46b76", size = 11258787, upload-time = "2026-03-31T06:46:17.683Z" },
+    { url = "https://files.pythonhosted.org/packages/90/e3/3f1126d43d3702ca8773871a81c9f15122a1f412342cc56284ffda5b1f70/pandas-3.0.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c934008c733b8bbea273ea308b73b3156f0181e5b72960790b09c18a2794fe1e", size = 11771616, upload-time = "2026-03-31T06:46:20.532Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/cf/0f4e268e1f5062e44a6bda9f925806721cd4c95c2b808a4c82ebe914f96b/pandas-3.0.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:60a80bb4feacbef5e1447a3f82c33209c8b7e07f28d805cfd1fb951e5cb443aa", size = 12337623, upload-time = "2026-03-31T06:46:23.754Z" },
+    { url = "https://files.pythonhosted.org/packages/44/a0/97a6339859d4acb2536efb24feb6708e82f7d33b2ed7e036f2983fcced82/pandas-3.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:ed72cb3f45190874eb579c64fa92d9df74e98fd63e2be7f62bce5ace0ade61df", size = 9897372, upload-time = "2026-03-31T06:46:26.703Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/eb/781516b808a99ddf288143cec46b342b3016c3414d137da1fdc3290d8860/pandas-3.0.2-cp311-cp311-win_arm64.whl", hash = "sha256:f12b1a9e332c01e09510586f8ca9b108fd631fd656af82e452d7315ef6df5f9f", size = 9154922, upload-time = "2026-03-31T06:46:30.284Z" },
     { url = "https://files.pythonhosted.org/packages/f3/b0/c20bd4d6d3f736e6bd6b55794e9cd0a617b858eaad27c8f410ea05d953b7/pandas-3.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:232a70ebb568c0c4d2db4584f338c1577d81e3af63292208d615907b698a0f18", size = 10347921, upload-time = "2026-03-31T06:46:33.36Z" },
     { url = "https://files.pythonhosted.org/packages/35/d0/4831af68ce30cc2d03c697bea8450e3225a835ef497d0d70f31b8cdde965/pandas-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:970762605cff1ca0d3f71ed4f3a769ea8f85fc8e6348f6e110b8fea7e6eb5a14", size = 9888127, upload-time = "2026-03-31T06:46:36.253Z" },
     { url = "https://files.pythonhosted.org/packages/61/a9/16ea9346e1fc4a96e2896242d9bc674764fb9049b0044c0132502f7a771e/pandas-3.0.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aff4e6f4d722e0652707d7bcb190c445fe58428500c6d16005b02401764b1b3d", size = 10399577, upload-time = "2026-03-31T06:46:39.224Z" },
@@ -4609,6 +5588,28 @@ version = "12.2.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/8c/21/c2bcdd5906101a30244eaffc1b6e6ce71a31bd0742a01eb89e660ebfac2d/pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5", size = 46987819, upload-time = "2026-04-01T14:46:17.687Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/3a/aa/d0b28e1c811cd4d5f5c2bfe2e022292bd255ae5744a3b9ac7d6c8f72dd75/pillow-12.2.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:a4e8f36e677d3336f35089648c8955c51c6d386a13cf6ee9c189c5f5bd713a9f", size = 5354355, upload-time = "2026-04-01T14:42:15.402Z" },
+    { url = "https://files.pythonhosted.org/packages/27/8e/1d5b39b8ae2bd7650d0c7b6abb9602d16043ead9ebbfef4bc4047454da2a/pillow-12.2.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2e589959f10d9824d39b350472b92f0ce3b443c0a3442ebf41c40cb8361c5b97", size = 4695871, upload-time = "2026-04-01T14:42:18.234Z" },
+    { url = "https://files.pythonhosted.org/packages/f0/c5/dcb7a6ca6b7d3be41a76958e90018d56c8462166b3ef223150360850c8da/pillow-12.2.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a52edc8bfff4429aaabdf4d9ee0daadbbf8562364f940937b941f87a4290f5ff", size = 6269734, upload-time = "2026-04-01T14:42:20.608Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/f1/aa1bb13b2f4eba914e9637893c73f2af8e48d7d4023b9d3750d4c5eb2d0c/pillow-12.2.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:975385f4776fafde056abb318f612ef6285b10a1f12b8570f3647ad0d74b48ec", size = 8076080, upload-time = "2026-04-01T14:42:23.095Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/2a/8c79d6a53169937784604a8ae8d77e45888c41537f7f6f65ed1f407fe66d/pillow-12.2.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bd9c0c7a0c681a347b3194c500cb1e6ca9cab053ea4d82a5cf45b6b754560136", size = 6382236, upload-time = "2026-04-01T14:42:25.82Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/42/bbcb6051030e1e421d103ce7a8ecadf837aa2f39b8f82ef1a8d37c3d4ebc/pillow-12.2.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:88d387ff40b3ff7c274947ed3125dedf5262ec6919d83946753b5f3d7c67ea4c", size = 7070220, upload-time = "2026-04-01T14:42:28.68Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/e1/c2a7d6dd8cfa6b231227da096fd2d58754bab3603b9d73bf609d3c18b64f/pillow-12.2.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:51c4167c34b0d8ba05b547a3bb23578d0ba17b80a5593f93bd8ecb123dd336a3", size = 6493124, upload-time = "2026-04-01T14:42:31.579Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/41/7c8617da5d32e1d2f026e509484fdb6f3ad7efaef1749a0c1928adbb099e/pillow-12.2.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:34c0d99ecccea270c04882cb3b86e7b57296079c9a4aff88cb3b33563d95afaa", size = 7194324, upload-time = "2026-04-01T14:42:34.615Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/de/a777627e19fd6d62f84070ee1521adde5eeda4855b5cf60fe0b149118bca/pillow-12.2.0-cp310-cp310-win32.whl", hash = "sha256:b85f66ae9eb53e860a873b858b789217ba505e5e405a24b85c0464822fe88032", size = 6376363, upload-time = "2026-04-01T14:42:37.19Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/34/fc4cb5204896465842767b96d250c08410f01f2f28afc43b257de842eed5/pillow-12.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:673aa32138f3e7531ccdbca7b3901dba9b70940a19ccecc6a37c77d5fdeb05b5", size = 7083523, upload-time = "2026-04-01T14:42:39.62Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/a0/32852d36bc7709f14dc3f64f929a275e958ad8c19a6deba9610d458e28b3/pillow-12.2.0-cp310-cp310-win_arm64.whl", hash = "sha256:3e080565d8d7c671db5802eedfb438e5565ffa40115216eabb8cd52d0ecce024", size = 2463318, upload-time = "2026-04-01T14:42:42.063Z" },
+    { url = "https://files.pythonhosted.org/packages/68/e1/748f5663efe6edcfc4e74b2b93edfb9b8b99b67f21a854c3ae416500a2d9/pillow-12.2.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:8be29e59487a79f173507c30ddf57e733a357f67881430449bb32614075a40ab", size = 5354347, upload-time = "2026-04-01T14:42:44.255Z" },
+    { url = "https://files.pythonhosted.org/packages/47/a1/d5ff69e747374c33a3b53b9f98cca7889fce1fd03d79cdc4e1bccc6c5a87/pillow-12.2.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:71cde9a1e1551df7d34a25462fc60325e8a11a82cc2e2f54578e5e9a1e153d65", size = 4695873, upload-time = "2026-04-01T14:42:46.452Z" },
+    { url = "https://files.pythonhosted.org/packages/df/21/e3fbdf54408a973c7f7f89a23b2cb97a7ef30c61ab4142af31eee6aebc88/pillow-12.2.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f490f9368b6fc026f021db16d7ec2fbf7d89e2edb42e8ec09d2c60505f5729c7", size = 6280168, upload-time = "2026-04-01T14:42:49.228Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/f1/00b7278c7dd52b17ad4329153748f87b6756ec195ff786c2bdf12518337d/pillow-12.2.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8bd7903a5f2a4545f6fd5935c90058b89d30045568985a71c79f5fd6edf9b91e", size = 8088188, upload-time = "2026-04-01T14:42:51.735Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/cf/220a5994ef1b10e70e85748b75649d77d506499352be135a4989c957b701/pillow-12.2.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3997232e10d2920a68d25191392e3a4487d8183039e1c74c2297f00ed1c50705", size = 6394401, upload-time = "2026-04-01T14:42:54.343Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/bd/e51a61b1054f09437acfbc2ff9106c30d1eb76bc1453d428399946781253/pillow-12.2.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e74473c875d78b8e9d5da2a70f7099549f9eb37ded4e2f6a463e60125bccd176", size = 7079655, upload-time = "2026-04-01T14:42:56.954Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/3d/45132c57d5fb4b5744567c3817026480ac7fc3ce5d4c47902bc0e7f6f853/pillow-12.2.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:56a3f9c60a13133a98ecff6197af34d7824de9b7b38c3654861a725c970c197b", size = 6503105, upload-time = "2026-04-01T14:42:59.847Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/2e/9df2fc1e82097b1df3dce58dc43286aa01068e918c07574711fcc53e6fb4/pillow-12.2.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:90e6f81de50ad6b534cab6e5aef77ff6e37722b2f5d908686f4a5c9eba17a909", size = 7203402, upload-time = "2026-04-01T14:43:02.664Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/2e/2941e42858ebb67e50ae741473de81c2984e6eff7b397017623c676e2e8d/pillow-12.2.0-cp311-cp311-win32.whl", hash = "sha256:8c984051042858021a54926eb597d6ee3012393ce9c181814115df4c60b9a808", size = 6378149, upload-time = "2026-04-01T14:43:05.274Z" },
+    { url = "https://files.pythonhosted.org/packages/69/42/836b6f3cd7f3e5fa10a1f1a5420447c17966044c8fbf589cc0452d5502db/pillow-12.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:6e6b2a0c538fc200b38ff9eb6628228b77908c319a005815f2dde585a0664b60", size = 7082626, upload-time = "2026-04-01T14:43:08.557Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/88/549194b5d6f1f494b485e493edc6693c0a16f4ada488e5bd974ed1f42fad/pillow-12.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:9a8a34cc89c67a65ea7437ce257cea81a9dad65b29805f3ecee8c8fe8ff25ffe", size = 2463531, upload-time = "2026-04-01T14:43:10.743Z" },
     { url = "https://files.pythonhosted.org/packages/58/be/7482c8a5ebebbc6470b3eb791812fff7d5e0216c2be3827b30b8bb6603ed/pillow-12.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2d192a155bbcec180f8564f693e6fd9bccff5a7af9b32e2e4bf8c9c69dbad6b5", size = 5308279, upload-time = "2026-04-01T14:43:13.246Z" },
     { url = "https://files.pythonhosted.org/packages/d8/95/0a351b9289c2b5cbde0bacd4a83ebc44023e835490a727b2a3bd60ddc0f4/pillow-12.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3f40b3c5a968281fd507d519e444c35f0ff171237f4fdde090dd60699458421", size = 4695490, upload-time = "2026-04-01T14:43:15.584Z" },
     { url = "https://files.pythonhosted.org/packages/de/af/4e8e6869cbed569d43c416fad3dc4ecb944cb5d9492defaed89ddd6fe871/pillow-12.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:03e7e372d5240cc23e9f07deca4d775c0817bffc641b01e9c3af208dbd300987", size = 6284462, upload-time = "2026-04-01T14:43:18.268Z" },
@@ -4670,15 +5671,22 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c9/e4/4b64a97d71b2a83158134abbb2f5bd3f8a2ea691361282f010998f339ec7/pillow-12.2.0-cp314-cp314t-win32.whl", hash = "sha256:6bb77b2dcb06b20f9f4b4a8454caa581cd4dd0643a08bacf821216a16d9c8354", size = 6482084, upload-time = "2026-04-01T14:45:47.568Z" },
     { url = "https://files.pythonhosted.org/packages/ba/13/306d275efd3a3453f72114b7431c877d10b1154014c1ebbedd067770d629/pillow-12.2.0-cp314-cp314t-win_amd64.whl", hash = "sha256:6562ace0d3fb5f20ed7290f1f929cae41b25ae29528f2af1722966a0a02e2aa1", size = 7225152, upload-time = "2026-04-01T14:45:50.032Z" },
     { url = "https://files.pythonhosted.org/packages/ff/6e/cf826fae916b8658848d7b9f38d88da6396895c676e8086fc0988073aaf8/pillow-12.2.0-cp314-cp314t-win_arm64.whl", hash = "sha256:aa88ccfe4e32d362816319ed727a004423aab09c5cea43c01a4b435643fa34eb", size = 2556579, upload-time = "2026-04-01T14:45:52.529Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/b7/2437044fb910f499610356d1352e3423753c98e34f915252aafecc64889f/pillow-12.2.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0538bd5e05efec03ae613fd89c4ce0368ecd2ba239cc25b9f9be7ed426b0af1f", size = 5273969, upload-time = "2026-04-01T14:45:55.538Z" },
+    { url = "https://files.pythonhosted.org/packages/f6/f4/8316e31de11b780f4ac08ef3654a75555e624a98db1056ecb2122d008d5a/pillow-12.2.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:394167b21da716608eac917c60aa9b969421b5dcbbe02ae7f013e7b85811c69d", size = 4659674, upload-time = "2026-04-01T14:45:58.093Z" },
+    { url = "https://files.pythonhosted.org/packages/d4/37/664fca7201f8bb2aa1d20e2c3d5564a62e6ae5111741966c8319ca802361/pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:5d04bfa02cc2d23b497d1e90a0f927070043f6cbf303e738300532379a4b4e0f", size = 5288479, upload-time = "2026-04-01T14:46:01.141Z" },
+    { url = "https://files.pythonhosted.org/packages/49/62/5b0ed78fce87346be7a5cfcfaaad91f6a1f98c26f86bdbafa2066c647ef6/pillow-12.2.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0c838a5125cee37e68edec915651521191cef1e6aa336b855f495766e77a366e", size = 7032230, upload-time = "2026-04-01T14:46:03.874Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/28/ec0fc38107fc32536908034e990c47914c57cd7c5a3ece4d8d8f7ffd7e27/pillow-12.2.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4a6c9fa44005fa37a91ebfc95d081e8079757d2e904b27103f4f5fa6f0bf78c0", size = 5355404, upload-time = "2026-04-01T14:46:06.33Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/8b/51b0eddcfa2180d60e41f06bd6d0a62202b20b59c68f5a132e615b75aecf/pillow-12.2.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:25373b66e0dd5905ed63fa3cae13c82fbddf3079f2c8bf15c6fb6a35586324c1", size = 6002215, upload-time = "2026-04-01T14:46:08.83Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/60/5382c03e1970de634027cee8e1b7d39776b778b81812aaf45b694dfe9e28/pillow-12.2.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:bfa9c230d2fe991bed5318a5f119bd6780cda2915cca595393649fc118ab895e", size = 7080946, upload-time = "2026-04-01T14:46:11.734Z" },
 ]
 
 [[package]]
 name = "platformdirs"
-version = "4.9.6"
+version = "4.9.4"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/9f/4a/0883b8e3802965322523f0b200ecf33d31f10991d0401162f4b23c698b42/platformdirs-4.9.6.tar.gz", hash = "sha256:3bfa75b0ad0db84096ae777218481852c0ebc6c727b3168c1b9e0118e458cf0a", size = 29400, upload-time = "2026-04-09T00:04:10.812Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/19/56/8d4c30c8a1d07013911a8fdbd8f89440ef9f08d07a1b50ab8ca8be5a20f9/platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934", size = 28737, upload-time = "2026-03-05T18:34:13.271Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/75/a6/a0a304dc33b49145b21f4808d763822111e67d1c3a32b524a1baf947b6e1/platformdirs-4.9.6-py3-none-any.whl", hash = "sha256:e61adb1d5e5cb3441b4b7710bea7e4c12250ca49439228cc1021c00dcfac0917", size = 21348, upload-time = "2026-04-09T00:04:09.463Z" },
+    { url = "https://files.pythonhosted.org/packages/63/d7/97f7e3a6abb67d8080dd406fd4df842c2be0efaf712d1c899c32a075027c/platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868", size = 21216, upload-time = "2026-03-05T18:34:12.172Z" },
 ]
 
 [[package]]
@@ -4713,11 +5721,11 @@ wheels = [
 
 [[package]]
 name = "prometheus-client"
-version = "0.25.0"
+version = "0.24.1"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/1b/fb/d9aa83ffe43ce1f19e557c0971d04b90561b0cfd50762aafb01968285553/prometheus_client-0.25.0.tar.gz", hash = "sha256:5e373b75c31afb3c86f1a52fa1ad470c9aace18082d39ec0d2f918d11cc9ba28", size = 86035, upload-time = "2026-04-09T19:53:42.359Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/f0/58/a794d23feb6b00fc0c72787d7e87d872a6730dd9ed7c7b3e954637d8f280/prometheus_client-0.24.1.tar.gz", hash = "sha256:7e0ced7fbbd40f7b84962d5d2ab6f17ef88a72504dcf7c0b40737b43b2a461f9", size = 85616, upload-time = "2026-01-14T15:26:26.965Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/8d/9b/d4b1e644385499c8346fa9b622a3f030dce14cd6ef8a1871c221a17a67e7/prometheus_client-0.25.0-py3-none-any.whl", hash = "sha256:d5aec89e349a6ec230805d0df882f3807f74fd6c1a2fa86864e3c2279059fed1", size = 64154, upload-time = "2026-04-09T19:53:41.324Z" },
+    { url = "https://files.pythonhosted.org/packages/74/c3/24a2f845e3917201628ecaba4f18bab4d18a337834c1df2a159ee9d22a42/prometheus_client-0.24.1-py3-none-any.whl", hash = "sha256:150db128af71a5c2482b36e588fc8a6b95e498750da4b17065947c16070f4055", size = 64057, upload-time = "2026-01-14T15:26:24.42Z" },
 ]
 
 [[package]]
@@ -4751,6 +5759,36 @@ version = "0.4.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/9e/da/e9fc233cf63743258bff22b3dfa7ea5baef7b5bc324af47a0ad89b8ffc6f/propcache-0.4.1.tar.gz", hash = "sha256:f48107a8c637e80362555f37ecf49abe20370e557cc4ab374f04ec4423c97c3d", size = 46442, upload-time = "2025-10-08T19:49:02.291Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/3c/0e/934b541323035566a9af292dba85a195f7b78179114f2c6ebb24551118a9/propcache-0.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c2d1fa3201efaf55d730400d945b5b3ab6e672e100ba0f9a409d950ab25d7db", size = 79534, upload-time = "2025-10-08T19:46:02.083Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/6b/db0d03d96726d995dc7171286c6ba9d8d14251f37433890f88368951a44e/propcache-0.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1eb2994229cc8ce7fe9b3db88f5465f5fd8651672840b2e426b88cdb1a30aac8", size = 45526, upload-time = "2025-10-08T19:46:03.884Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/c3/82728404aea669e1600f304f2609cde9e665c18df5a11cdd57ed73c1dceb/propcache-0.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:66c1f011f45a3b33d7bcb22daed4b29c0c9e2224758b6be00686731e1b46f925", size = 47263, upload-time = "2025-10-08T19:46:05.405Z" },
+    { url = "https://files.pythonhosted.org/packages/df/1b/39313ddad2bf9187a1432654c38249bab4562ef535ef07f5eb6eb04d0b1b/propcache-0.4.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9a52009f2adffe195d0b605c25ec929d26b36ef986ba85244891dee3b294df21", size = 201012, upload-time = "2025-10-08T19:46:07.165Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/01/f1d0b57d136f294a142acf97f4ed58c8e5b974c21e543000968357115011/propcache-0.4.1-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:5d4e2366a9c7b837555cf02fb9be2e3167d333aff716332ef1b7c3a142ec40c5", size = 209491, upload-time = "2025-10-08T19:46:08.909Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/c8/038d909c61c5bb039070b3fb02ad5cccdb1dde0d714792e251cdb17c9c05/propcache-0.4.1-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:9d2b6caef873b4f09e26ea7e33d65f42b944837563a47a94719cc3544319a0db", size = 215319, upload-time = "2025-10-08T19:46:10.7Z" },
+    { url = "https://files.pythonhosted.org/packages/08/57/8c87e93142b2c1fa2408e45695205a7ba05fb5db458c0bf5c06ba0e09ea6/propcache-0.4.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b16ec437a8c8a965ecf95739448dd938b5c7f56e67ea009f4300d8df05f32b7", size = 196856, upload-time = "2025-10-08T19:46:12.003Z" },
+    { url = "https://files.pythonhosted.org/packages/42/df/5615fec76aa561987a534759b3686008a288e73107faa49a8ae5795a9f7a/propcache-0.4.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:296f4c8ed03ca7476813fe666c9ea97869a8d7aec972618671b33a38a5182ef4", size = 193241, upload-time = "2025-10-08T19:46:13.495Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/21/62949eb3a7a54afe8327011c90aca7e03547787a88fb8bd9726806482fea/propcache-0.4.1-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:1f0978529a418ebd1f49dad413a2b68af33f85d5c5ca5c6ca2a3bed375a7ac60", size = 190552, upload-time = "2025-10-08T19:46:14.938Z" },
+    { url = "https://files.pythonhosted.org/packages/30/ee/ab4d727dd70806e5b4de96a798ae7ac6e4d42516f030ee60522474b6b332/propcache-0.4.1-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fd138803047fb4c062b1c1dd95462f5209456bfab55c734458f15d11da288f8f", size = 200113, upload-time = "2025-10-08T19:46:16.695Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/0b/38b46208e6711b016aa8966a3ac793eee0d05c7159d8342aa27fc0bc365e/propcache-0.4.1-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:8c9b3cbe4584636d72ff556d9036e0c9317fa27b3ac1f0f558e7e84d1c9c5900", size = 200778, upload-time = "2025-10-08T19:46:18.023Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/81/5abec54355ed344476bee711e9f04815d4b00a311ab0535599204eecc257/propcache-0.4.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f93243fdc5657247533273ac4f86ae106cc6445a0efacb9a1bfe982fcfefd90c", size = 193047, upload-time = "2025-10-08T19:46:19.449Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/b6/1f237c04e32063cb034acd5f6ef34ef3a394f75502e72703545631ab1ef6/propcache-0.4.1-cp310-cp310-win32.whl", hash = "sha256:a0ee98db9c5f80785b266eb805016e36058ac72c51a064040f2bc43b61101cdb", size = 38093, upload-time = "2025-10-08T19:46:20.643Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/67/354aac4e0603a15f76439caf0427781bcd6797f370377f75a642133bc954/propcache-0.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:1cdb7988c4e5ac7f6d175a28a9aa0c94cb6f2ebe52756a3c0cda98d2809a9e37", size = 41638, upload-time = "2025-10-08T19:46:21.935Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/e1/74e55b9fd1a4c209ff1a9a824bf6c8b3d1fc5a1ac3eabe23462637466785/propcache-0.4.1-cp310-cp310-win_arm64.whl", hash = "sha256:d82ad62b19645419fe79dd63b3f9253e15b30e955c0170e5cebc350c1844e581", size = 38229, upload-time = "2025-10-08T19:46:23.368Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/d4/4e2c9aaf7ac2242b9358f98dccd8f90f2605402f5afeff6c578682c2c491/propcache-0.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:60a8fda9644b7dfd5dece8c61d8a85e271cb958075bfc4e01083c148b61a7caf", size = 80208, upload-time = "2025-10-08T19:46:24.597Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/21/d7b68e911f9c8e18e4ae43bdbc1e1e9bbd971f8866eb81608947b6f585ff/propcache-0.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c30b53e7e6bda1d547cabb47c825f3843a0a1a42b0496087bb58d8fedf9f41b5", size = 45777, upload-time = "2025-10-08T19:46:25.733Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/1d/11605e99ac8ea9435651ee71ab4cb4bf03f0949586246476a25aadfec54a/propcache-0.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6918ecbd897443087a3b7cd978d56546a812517dcaaca51b49526720571fa93e", size = 47647, upload-time = "2025-10-08T19:46:27.304Z" },
+    { url = "https://files.pythonhosted.org/packages/58/1a/3c62c127a8466c9c843bccb503d40a273e5cc69838805f322e2826509e0d/propcache-0.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3d902a36df4e5989763425a8ab9e98cd8ad5c52c823b34ee7ef307fd50582566", size = 214929, upload-time = "2025-10-08T19:46:28.62Z" },
+    { url = "https://files.pythonhosted.org/packages/56/b9/8fa98f850960b367c4b8fe0592e7fc341daa7a9462e925228f10a60cf74f/propcache-0.4.1-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a9695397f85973bb40427dedddf70d8dc4a44b22f1650dd4af9eedf443d45165", size = 221778, upload-time = "2025-10-08T19:46:30.358Z" },
+    { url = "https://files.pythonhosted.org/packages/46/a6/0ab4f660eb59649d14b3d3d65c439421cf2f87fe5dd68591cbe3c1e78a89/propcache-0.4.1-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2bb07ffd7eaad486576430c89f9b215f9e4be68c4866a96e97db9e97fead85dc", size = 228144, upload-time = "2025-10-08T19:46:32.607Z" },
+    { url = "https://files.pythonhosted.org/packages/52/6a/57f43e054fb3d3a56ac9fc532bc684fc6169a26c75c353e65425b3e56eef/propcache-0.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:fd6f30fdcf9ae2a70abd34da54f18da086160e4d7d9251f81f3da0ff84fc5a48", size = 210030, upload-time = "2025-10-08T19:46:33.969Z" },
+    { url = "https://files.pythonhosted.org/packages/40/e2/27e6feebb5f6b8408fa29f5efbb765cd54c153ac77314d27e457a3e993b7/propcache-0.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:fc38cba02d1acba4e2869eef1a57a43dfbd3d49a59bf90dda7444ec2be6a5570", size = 208252, upload-time = "2025-10-08T19:46:35.309Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/f8/91c27b22ccda1dbc7967f921c42825564fa5336a01ecd72eb78a9f4f53c2/propcache-0.4.1-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:67fad6162281e80e882fb3ec355398cf72864a54069d060321f6cd0ade95fe85", size = 202064, upload-time = "2025-10-08T19:46:36.993Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/26/7f00bd6bd1adba5aafe5f4a66390f243acab58eab24ff1a08bebb2ef9d40/propcache-0.4.1-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f10207adf04d08bec185bae14d9606a1444715bc99180f9331c9c02093e1959e", size = 212429, upload-time = "2025-10-08T19:46:38.398Z" },
+    { url = "https://files.pythonhosted.org/packages/84/89/fd108ba7815c1117ddca79c228f3f8a15fc82a73bca8b142eb5de13b2785/propcache-0.4.1-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:e9b0d8d0845bbc4cfcdcbcdbf5086886bc8157aa963c31c777ceff7846c77757", size = 216727, upload-time = "2025-10-08T19:46:39.732Z" },
+    { url = "https://files.pythonhosted.org/packages/79/37/3ec3f7e3173e73f1d600495d8b545b53802cbf35506e5732dd8578db3724/propcache-0.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:981333cb2f4c1896a12f4ab92a9cc8f09ea664e9b7dbdc4eff74627af3a11c0f", size = 205097, upload-time = "2025-10-08T19:46:41.025Z" },
+    { url = "https://files.pythonhosted.org/packages/61/b0/b2631c19793f869d35f47d5a3a56fb19e9160d3c119f15ac7344fc3ccae7/propcache-0.4.1-cp311-cp311-win32.whl", hash = "sha256:f1d2f90aeec838a52f1c1a32fe9a619fefd5e411721a9117fbf82aea638fe8a1", size = 38084, upload-time = "2025-10-08T19:46:42.693Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/78/6cce448e2098e9f3bfc91bb877f06aa24b6ccace872e39c53b2f707c4648/propcache-0.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:364426a62660f3f699949ac8c621aad6977be7126c5807ce48c0aeb8e7333ea6", size = 41637, upload-time = "2025-10-08T19:46:43.778Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/e9/754f180cccd7f51a39913782c74717c581b9cc8177ad0e949f4d51812383/propcache-0.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:e53f3a38d3510c11953f3e6a33f205c6d1b001129f972805ca9b42fc308bc239", size = 38064, upload-time = "2025-10-08T19:46:44.872Z" },
     { url = "https://files.pythonhosted.org/packages/a2/0f/f17b1b2b221d5ca28b4b876e8bb046ac40466513960646bda8e1853cdfa2/propcache-0.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e153e9cd40cc8945138822807139367f256f89c6810c2634a4f6902b52d3b4e2", size = 80061, upload-time = "2025-10-08T19:46:46.075Z" },
     { url = "https://files.pythonhosted.org/packages/76/47/8ccf75935f51448ba9a16a71b783eb7ef6b9ee60f5d14c7f8a8a79fbeed7/propcache-0.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:cd547953428f7abb73c5ad82cbb32109566204260d98e41e5dfdc682eb7f8403", size = 46037, upload-time = "2025-10-08T19:46:47.23Z" },
     { url = "https://files.pythonhosted.org/packages/0a/b6/5c9a0e42df4d00bfb4a3cbbe5cf9f54260300c88a0e9af1f47ca5ce17ac0/propcache-0.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f048da1b4f243fc44f205dfd320933a951b8d89e0afd4c7cacc762a8b9165207", size = 47324, upload-time = "2025-10-08T19:46:48.384Z" },
@@ -4914,6 +5952,20 @@ version = "23.0.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336, upload-time = "2026-02-16T10:14:12.39Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/bc/a8/24e5dc6855f50a62936ceb004e6e9645e4219a8065f304145d7fb8a79d5d/pyarrow-23.0.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:3fab8f82571844eb3c460f90a75583801d14ca0cc32b1acc8c361650e006fd56", size = 34307390, upload-time = "2026-02-16T10:08:08.654Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/8e/4be5617b4aaae0287f621ad31c6036e5f63118cfca0dc57d42121ff49b51/pyarrow-23.0.1-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:3f91c038b95f71ddfc865f11d5876c42f343b4495535bd262c7b321b0b94507c", size = 35853761, upload-time = "2026-02-16T10:08:17.811Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/08/3e56a18819462210432ae37d10f5c8eed3828be1d6c751b6e6a2e93c286a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:d0744403adabef53c985a7f8a082b502a368510c40d184df349a0a8754533258", size = 44493116, upload-time = "2026-02-16T10:08:25.792Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/82/c40b68001dbec8a3faa4c08cd8c200798ac732d2854537c5449dc859f55a/pyarrow-23.0.1-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c33b5bf406284fd0bba436ed6f6c3ebe8e311722b441d89397c54f871c6863a2", size = 47564532, upload-time = "2026-02-16T10:08:34.27Z" },
+    { url = "https://files.pythonhosted.org/packages/20/bc/73f611989116b6f53347581b02177f9f620efdf3cd3f405d0e83cdf53a83/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ddf743e82f69dcd6dbbcb63628895d7161e04e56794ef80550ac6f3315eeb1d5", size = 48183685, upload-time = "2026-02-16T10:08:42.889Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/cc/6c6b3ecdae2a8c3aced99956187e8302fc954cc2cca2a37cf2111dad16ce/pyarrow-23.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e052a211c5ac9848ae15d5ec875ed0943c0221e2fcfe69eee80b604b4e703222", size = 50605582, upload-time = "2026-02-16T10:08:51.641Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/94/d359e708672878d7638a04a0448edf7c707f9e5606cee11e15aaa5c7535a/pyarrow-23.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:5abde149bb3ce524782d838eb67ac095cd3fd6090eba051130589793f1a7f76d", size = 27521148, upload-time = "2026-02-16T10:08:58.077Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/41/8e6b6ef7e225d4ceead8459427a52afdc23379768f54dd3566014d7618c1/pyarrow-23.0.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:6f0147ee9e0386f519c952cc670eb4a8b05caa594eeffe01af0e25f699e4e9bb", size = 34302230, upload-time = "2026-02-16T10:09:03.859Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/4a/1472c00392f521fea03ae93408bf445cc7bfa1ab81683faf9bc188e36629/pyarrow-23.0.1-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:0ae6e17c828455b6265d590100c295193f93cc5675eb0af59e49dbd00d2de350", size = 35850050, upload-time = "2026-02-16T10:09:11.877Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/b2/bd1f2f05ded56af7f54d702c8364c9c43cd6abb91b0e9933f3d77b4f4132/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:fed7020203e9ef273360b9e45be52a2a47d3103caf156a30ace5247ffb51bdbd", size = 44491918, upload-time = "2026-02-16T10:09:18.144Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/62/96459ef5b67957eac38a90f541d1c28833d1b367f014a482cb63f3b7cd2d/pyarrow-23.0.1-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:26d50dee49d741ac0e82185033488d28d35be4d763ae6f321f97d1140eb7a0e9", size = 47562811, upload-time = "2026-02-16T10:09:25.792Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/94/1170e235add1f5f45a954e26cd0e906e7e74e23392dcb560de471f7366ec/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c30143b17161310f151f4a2bcfe41b5ff744238c1039338779424e38579d701", size = 48183766, upload-time = "2026-02-16T10:09:34.645Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/2d/39a42af4570377b99774cdb47f63ee6c7da7616bd55b3d5001aa18edfe4f/pyarrow-23.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:db2190fa79c80a23fdd29fef4b8992893f024ae7c17d2f5f4db7171fa30c2c78", size = 50607669, upload-time = "2026-02-16T10:09:44.153Z" },
+    { url = "https://files.pythonhosted.org/packages/00/ca/db94101c187f3df742133ac837e93b1f269ebdac49427f8310ee40b6a58f/pyarrow-23.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:f00f993a8179e0e1c9713bcc0baf6d6c01326a406a9c23495ec1ba9c9ebf2919", size = 27527698, upload-time = "2026-02-16T10:09:50.263Z" },
     { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575, upload-time = "2026-02-16T10:09:56.225Z" },
     { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540, upload-time = "2026-02-16T10:10:03.428Z" },
     { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940, upload-time = "2026-02-16T10:10:10.704Z" },
@@ -4974,11 +6026,11 @@ wheels = [
 
 [[package]]
 name = "pybind11"
-version = "3.0.4"
+version = "3.0.3"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/cc/f0/35145a3c3baffeef55d4b8324caa33abaa8fa56ab345ecd4b2211d09163e/pybind11-3.0.4.tar.gz", hash = "sha256:3286b59c8a774b9ee650169302dd5a4eedc30a8617905a0560dd8ee44775130c", size = 589533, upload-time = "2026-04-19T03:08:15.925Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/41/50/b83d65efc1914681f5aded4ce37c703408a9bb74829f27f041560ca52ffb/pybind11-3.0.3.tar.gz", hash = "sha256:00471cdb816882c484708bc5dde80815c8c11cea540ab2cc6410f5ddea434755", size = 587814, upload-time = "2026-03-31T23:42:06.481Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b3/06/c3a23c9a0263b136c519f033a58d4641e73065fefc7754e9667ec206d992/pybind11-3.0.4-py3-none-any.whl", hash = "sha256:961720ee652da51d531b7b2451a6bd2bc042b0106e6d9baa48ecb7d58034ce63", size = 314166, upload-time = "2026-04-19T03:08:14.091Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/87/99f21e9b20899d6dc1bf7544cfe53e5fa17acc21bb267971a540425357d3/pybind11-3.0.3-py3-none-any.whl", hash = "sha256:fb5f8e4a64946b4dcc0451c83a8c384f803bc0a62dd1ba02f199e97dbc9aad4c", size = 313717, upload-time = "2026-03-31T23:42:04.814Z" },
 ]
 
 [[package]]
@@ -5001,7 +6053,7 @@ wheels = [
 
 [[package]]
 name = "pydantic"
-version = "2.13.2"
+version = "2.12.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "annotated-types" },
@@ -5009,84 +6061,127 @@ dependencies = [
     { name = "typing-extensions" },
     { name = "typing-inspection" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/09/e5/06d23afac9973109d1e3c8ad38e1547a12e860610e327c05ee686827dc37/pydantic-2.13.2.tar.gz", hash = "sha256:b418196607e61081c3226dcd4f0672f2a194828abb9109e9cfb84026564df2d1", size = 843836, upload-time = "2026-04-17T09:31:59.636Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/69/44/36f1a6e523abc58ae5f928898e4aca2e0ea509b5aa6f6f392a5d882be928/pydantic-2.12.5.tar.gz", hash = "sha256:4d351024c75c0f085a9febbb665ce8c0c6ec5d30e903bdb6394b7ede26aebb49", size = 821591, upload-time = "2025-11-26T15:11:46.471Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/77/ca/b45c378e6e8d0b90577288b533e04e95b7afd61bb1d51b6c263176435489/pydantic-2.13.2-py3-none-any.whl", hash = "sha256:a525087f4c03d7e7456a3de89b64cd693d2229933bb1068b9af6befd5563694e", size = 471947, upload-time = "2026-04-17T09:31:57.541Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/87/b70ad306ebb6f9b585f114d0ac2137d792b48be34d732d60e597c2f8465a/pydantic-2.12.5-py3-none-any.whl", hash = "sha256:e561593fccf61e8a20fc46dfc2dfe075b8be7d0188df33f221ad1f0139180f9d", size = 463580, upload-time = "2025-11-26T15:11:44.605Z" },
 ]
 
 [[package]]
 name = "pydantic-core"
-version = "2.46.2"
+version = "2.41.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/43/bb/4742f05b739b2478459bb16fa8470549518c802e06ddcf3f106c5081315e/pydantic_core-2.46.2.tar.gz", hash = "sha256:37bb079f9ee3f1a519392b73fda2a96379b31f2013c6b467fe693e7f2987f596", size = 471269, upload-time = "2026-04-17T09:10:07.017Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/97/ec/2fafa4c86f5d2a69372c7cddef30925fd0e370b1efaf556609c1a0196d8a/pydantic_core-2.46.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:ea1ad8c89da31512fe2d249cf0638fb666925bda341901541bc5f3311c6fcc9e", size = 2101729, upload-time = "2026-04-17T09:12:30.042Z" },
-    { url = "https://files.pythonhosted.org/packages/cf/55/be5386c2c4b49af346e8a26b748194ff25757bbb6cf544130854e997af7a/pydantic_core-2.46.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b308da17b92481e0587244631c5529e5d91d04cb2b08194825627b1eca28e21e", size = 1951546, upload-time = "2026-04-17T09:10:10.585Z" },
-    { url = "https://files.pythonhosted.org/packages/29/92/89e273a055ce440e6636c756379af35ad86da9d336a560049c3ba5e41c80/pydantic_core-2.46.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d333a50bdd814a917d8d6a7ee35ba2395d53ddaa882613bc24e54a9d8b129095", size = 1976178, upload-time = "2026-04-17T09:11:49.619Z" },
-    { url = "https://files.pythonhosted.org/packages/91/b3/e4664469cf70c0cb0f7b2f5719d64e5968bb6f38217042c2afa3d3c4ba17/pydantic_core-2.46.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d00b99590c5bd1fabbc5d28b170923e32c1b1071b1f1de1851a4d14d89eb192", size = 2051697, upload-time = "2026-04-17T09:12:04.917Z" },
-    { url = "https://files.pythonhosted.org/packages/98/58/dbf68213ee06ce51cdd6d8c95f97980e646858c45bd96bd2dfb40433be73/pydantic_core-2.46.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9f0e686960ffe9e65066395af856ac2d52c159043144433602c50c221d81c1ba", size = 2233160, upload-time = "2026-04-17T09:12:00.956Z" },
-    { url = "https://files.pythonhosted.org/packages/f5/d3/68092aa0ee6c60ff4de4740eb82db3d4ce338ec89b3cecb978c532472f12/pydantic_core-2.46.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2d1128da41c9cb474e0a4701f9c363ec645c9d1a02229904c76bf4e0a194fde2", size = 2298398, upload-time = "2026-04-17T09:10:29.694Z" },
-    { url = "https://files.pythonhosted.org/packages/e4/51/5d6155eb737db55b0ad354ca5f333ef009f75feb67df2d79a84bace45af6/pydantic_core-2.46.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48649cf2d8c358d79586e9fb2f8235902fcaa2d969ec1c5301f2d1873b2f8321", size = 2094058, upload-time = "2026-04-17T09:12:10.995Z" },
-    { url = "https://files.pythonhosted.org/packages/6b/f3/eb4a986197d71319430464ff181226c95adc8f06d932189b158bae5a82f5/pydantic_core-2.46.2-cp312-cp312-manylinux_2_31_riscv64.whl", hash = "sha256:b902f0fc7c2cf503865a05718b68147c6cd5d0a3867af38c527be574a9fa6e9d", size = 2130388, upload-time = "2026-04-17T09:12:41.159Z" },
-    { url = "https://files.pythonhosted.org/packages/56/00/44a9c4fe6d0f64b5786d6a8c649d6f0e34ba6c89b3663add1066e54451a2/pydantic_core-2.46.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e80011f808b03d1d87a8f1e76ae3da19a18eb706c823e17981dcf1fae43744fc", size = 2184245, upload-time = "2026-04-17T09:12:36.532Z" },
-    { url = "https://files.pythonhosted.org/packages/78/6b/685b98a834d5e3d1c34a1bde1627525559dd223b75075bc7490cdb24eb33/pydantic_core-2.46.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b839d5c802e31348b949b6473f8190cddbf7d47475856d8ac995a373ee16ec59", size = 2186842, upload-time = "2026-04-17T09:13:04.054Z" },
-    { url = "https://files.pythonhosted.org/packages/22/64/caa2f5a2ac8b6113adaa410ccdf31ba7f54897a6e54cd0d726fc7e780c88/pydantic_core-2.46.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:c6b1064f3f9cf9072e1d59dd2936f9f3b668bec1c37039708c9222db703c0d5b", size = 2336066, upload-time = "2026-04-17T09:12:13.006Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/f9/7d2701bf82945b5b9e7df8347be97ef6a36da2846bfe5b4afec299ffe27b/pydantic_core-2.46.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a68e6f2ac95578ce3c0564802404b27b24988649616e556c07e77111ed3f1d", size = 2363691, upload-time = "2026-04-17T09:13:42.972Z" },
-    { url = "https://files.pythonhosted.org/packages/3b/65/0dab11574101522941055109419db3cc09db871643dc3fc74e2413215e5b/pydantic_core-2.46.2-cp312-cp312-win32.whl", hash = "sha256:d9ffa75a7ef4b97d6e5e205fabd4304ef01fec09e6f1bdde04b9ad1b07d20289", size = 1958801, upload-time = "2026-04-17T09:11:31.981Z" },
-    { url = "https://files.pythonhosted.org/packages/13/2b/df84baa609c676f6450b8ecad44ea59146c805e3371b7b52443c0899f989/pydantic_core-2.46.2-cp312-cp312-win_amd64.whl", hash = "sha256:0551f2d2ddb68af5a00e26497f8025c538f73ef3cb698f8e5a487042cd2792a8", size = 2072634, upload-time = "2026-04-17T09:11:02.407Z" },
-    { url = "https://files.pythonhosted.org/packages/d1/4e/e1ce8029fc438086a946739bf9d596f70ff470aad4a8345555920618cabe/pydantic_core-2.46.2-cp312-cp312-win_arm64.whl", hash = "sha256:83aef30f106edcc21a6a4cc44b82d3169a1dbe255508db788e778f3c804d3583", size = 2026188, upload-time = "2026-04-17T09:13:11.083Z" },
-    { url = "https://files.pythonhosted.org/packages/07/2b/662e48254479a2d3450ba24b1e25061108b64339794232f503990c519144/pydantic_core-2.46.2-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:d26e9eea3715008a09a74585fe9becd0c67fbb145dc4df9756d597d7230a652c", size = 2101762, upload-time = "2026-04-17T09:10:13.87Z" },
-    { url = "https://files.pythonhosted.org/packages/73/ab/bafd7c7503757ccc8ec4d1911e106fe474c629443648c51a88f08b0fe91a/pydantic_core-2.46.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:48b36e3235140510dc7861f0cd58b714b1cdd3d48f75e10ce52e69866b746f10", size = 1951814, upload-time = "2026-04-17T09:12:25.934Z" },
-    { url = "https://files.pythonhosted.org/packages/92/cc/7549c2d57ba2e9a42caa5861a2d398dbe31c02c6aca783253ace59ce84f8/pydantic_core-2.46.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:36b1f99dc451f1a3981f236151465bcf995bbe712d0727c9f7b236fe228a8133", size = 1977329, upload-time = "2026-04-17T09:13:37.605Z" },
-    { url = "https://files.pythonhosted.org/packages/18/50/7ed4a8a0d478a4dca8f0134a5efa7193f03cc8520dd4c9509339fb2e5002/pydantic_core-2.46.2-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8641c8d535c2d95b45c2e19b646ecd23ebba35d461e0ae48a3498277006250ab", size = 2051832, upload-time = "2026-04-17T09:12:49.771Z" },
-    { url = "https://files.pythonhosted.org/packages/dc/16/bb35b193741c0298ddc5f5e4234269efdc0c65e2bcd198aa0de9b68845e4/pydantic_core-2.46.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:20fb194788a0a50993e87013e693494ba183a2af5b44e99cf060bbae10912b11", size = 2233127, upload-time = "2026-04-17T09:11:04.449Z" },
-    { url = "https://files.pythonhosted.org/packages/91/a5/98f4b637149185addea19e1785ea20c373cca31b202f589111d8209d9873/pydantic_core-2.46.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9262d11d0cd11ee3303a95156939402bed6cedfe5ed0e331b95a283a4da6eb8b", size = 2297418, upload-time = "2026-04-17T09:11:25.929Z" },
-    { url = "https://files.pythonhosted.org/packages/36/90/93a5d21990b152da7b7507b7fddb0b935f6a0984d57ac3ec45a6e17777a2/pydantic_core-2.46.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac204542736aa295fa25f713b7fad6fc50b46ab7764d16087575c85f085174f3", size = 2093735, upload-time = "2026-04-17T09:12:06.908Z" },
-    { url = "https://files.pythonhosted.org/packages/14/22/b8b1ffdddf08b4e84380bcb67f41dbbf4c171377c1d36fc6290794bb2094/pydantic_core-2.46.2-cp313-cp313-manylinux_2_31_riscv64.whl", hash = "sha256:9a7c43a0584742dface3ca0daf6f719d46c1ac2f87cf080050f9ae052c75e1b2", size = 2127570, upload-time = "2026-04-17T09:11:53.906Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/26/e60d72b4e2d0ce1fa811044a974412ac1c567fe067d97b3e6b290530786e/pydantic_core-2.46.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fd05e1edb6a90ad446fa268ab09e59202766b837597b714b2492db11ee87fab9", size = 2183524, upload-time = "2026-04-17T09:11:30.092Z" },
-    { url = "https://files.pythonhosted.org/packages/35/32/36bec7584a1eefb17dec4dfa1c946d3fe4440f466c5705b8adfda69c9a9f/pydantic_core-2.46.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:91155b110788b5501abc7ea954f1d08606219e4e28e3c73a94124307c06efb80", size = 2185408, upload-time = "2026-04-17T09:10:57.228Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/d6/1a5689d873620efd67d6b163db0c444c056adb0849b5bc33e2b9f09665a6/pydantic_core-2.46.2-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:e4e2c72a529fa03ff228be1d2b76944013f428220b764e03cc50ada67e17a42c", size = 2335171, upload-time = "2026-04-17T09:11:43.369Z" },
-    { url = "https://files.pythonhosted.org/packages/3e/8e/675104802abe8ef502b072050ee5f2e915251aa1a3af87e1015ce31ec42d/pydantic_core-2.46.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:56291ec1a11c3499890c99a8fd9053b47e60fe837a77ec72c0671b1b8b3dce24", size = 2362743, upload-time = "2026-04-17T09:10:18.333Z" },
-    { url = "https://files.pythonhosted.org/packages/8d/bc/86c5dde4fa6e24467680eef5047da3c1a19be0a527d0d8e14aa76b39307c/pydantic_core-2.46.2-cp313-cp313-win32.whl", hash = "sha256:b50f9c5f826ddca1246f055148df939f5f3f2d0d96db73de28e2233f22210d4c", size = 1958074, upload-time = "2026-04-17T09:12:38.622Z" },
-    { url = "https://files.pythonhosted.org/packages/2a/97/2537e8c1282b2c4eb062580c0d7a4339e10b072b803d1ee0b7f1f0a5c22c/pydantic_core-2.46.2-cp313-cp313-win_amd64.whl", hash = "sha256:251a57788823230ca8cbc99e6245d1a2ed6e180ec4864f251c94182c580c7f2e", size = 2071741, upload-time = "2026-04-17T09:13:32.405Z" },
-    { url = "https://files.pythonhosted.org/packages/da/aa/2ee75798706f9dbc4e76dbe59e41a396c5c311e3d6223b9cf6a5fa7780be/pydantic_core-2.46.2-cp313-cp313-win_arm64.whl", hash = "sha256:315d32d1a71494d6b4e1e14a9fa7a4329597b4c4340088ad7e1a9dafbeed92a9", size = 2025955, upload-time = "2026-04-17T09:10:15.567Z" },
-    { url = "https://files.pythonhosted.org/packages/d0/96/a50ccb6b539ae780f73cea74905468777680e30c6c3bdf714b9d4c116ea0/pydantic_core-2.46.2-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:4f59b45f3ef8650c0c736a57f59031d47ed9df4c0a64e83796849d7d14863a2d", size = 2097111, upload-time = "2026-04-17T09:10:49.617Z" },
-    { url = "https://files.pythonhosted.org/packages/34/5f/fdead7b3afa822ab6e5a18ee0ecffd54937de1877c01ed13a342e0fb3f07/pydantic_core-2.46.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3a075a29ebef752784a91532a1a85be6b234ccffec0a9d7978a92696387c3da6", size = 1951904, upload-time = "2026-04-17T09:12:32.062Z" },
-    { url = "https://files.pythonhosted.org/packages/95/e0/1c5d547e550cdab1bec737492aa08865337af6fe7fc9b96f7f45f17d9519/pydantic_core-2.46.2-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d12d786e30c04a9d307c5d7080bf720d9bac7f1668191d8e37633a9562749e2", size = 1978667, upload-time = "2026-04-17T09:11:35.589Z" },
-    { url = "https://files.pythonhosted.org/packages/0e/cb/665ce629e218c8228302cb94beff4f6531082a2c87d3ecc3d5e63a26f392/pydantic_core-2.46.2-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:0d5e6d6343b0b5dcacb3503b5de90022968da8ed0ab9ab39d3eda71c20cbf84e", size = 2046721, upload-time = "2026-04-17T09:11:47.725Z" },
-    { url = "https://files.pythonhosted.org/packages/77/e9/6cb2cf60f54c1472bbdfce19d957553b43dbba79d1d7b2930a195c594785/pydantic_core-2.46.2-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:233eebac0999b6b9ba76eb56f3ec8fce13164aa16b6d2225a36a79e0f95b5973", size = 2228483, upload-time = "2026-04-17T09:12:08.837Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/2a/93e018dd5571f781ebaeda8c0cf65398489d5bee9b1f484df0b6149b43b9/pydantic_core-2.46.2-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9cc0eee720dd2f14f3b7c349469402b99ad81a174ab49d3533974529e9d93992", size = 2294663, upload-time = "2026-04-17T09:12:52.053Z" },
-    { url = "https://files.pythonhosted.org/packages/5e/4f/49e57ca55c770c93d9bb046666a54949b42e3c9099a0c5fe94557873fe30/pydantic_core-2.46.2-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83ee76bf2c9910513dbc19e7d82367131fa7508dedd6186a462393071cc11059", size = 2098742, upload-time = "2026-04-17T09:13:45.472Z" },
-    { url = "https://files.pythonhosted.org/packages/c6/b0/6e46b5cd3332af665f794b8cdeea206618a8630bd9e7bcc36864518fce81/pydantic_core-2.46.2-cp314-cp314-manylinux_2_31_riscv64.whl", hash = "sha256:d61db38eb4ee5192f0c261b7f2d38e420b554df8912245e3546aee5c45e2fd78", size = 2125922, upload-time = "2026-04-17T09:12:54.304Z" },
-    { url = "https://files.pythonhosted.org/packages/06/d1/40850c81585be443a2abfdf7f795f8fae831baf8e2f9b2133c8246ac671c/pydantic_core-2.46.2-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8f09a713d17bcd55da8ab02ebd9110c5246a49c44182af213b5212800af8bc83", size = 2183000, upload-time = "2026-04-17T09:10:59.027Z" },
-    { url = "https://files.pythonhosted.org/packages/04/af/8493d7dfa03ebb7866909e577c6aa65ea0de7377b86023cc51d0c8e11db3/pydantic_core-2.46.2-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:30cacc5fb696e64b8ef6fd31d9549d394dd7d52760db072eecb98e37e3af1677", size = 2180335, upload-time = "2026-04-17T09:12:57.01Z" },
-    { url = "https://files.pythonhosted.org/packages/72/5b/1f6a344c4ffdf284da41c6067b82d5ebcbd11ce1b515ae4b662d4adb6f61/pydantic_core-2.46.2-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:7ccfb105fcfe91a22bbb5563ad3dc124bc1aa75bfd2e53a780ab05f78cdf6108", size = 2330002, upload-time = "2026-04-17T09:12:02.958Z" },
-    { url = "https://files.pythonhosted.org/packages/25/ff/9a694126c12d6d2f48a0cafa6f8eef88ef0d8825600e18d03ff2e896c3b2/pydantic_core-2.46.2-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:13ffef637dc8370c249e5b26bd18e9a80a4fca3d809618c44e18ec834a7ca7a8", size = 2359920, upload-time = "2026-04-17T09:10:27.764Z" },
-    { url = "https://files.pythonhosted.org/packages/51/c8/3a35c763d68a9cb2675eb10ef242cf66c5d4701b28ae12e688d67d2c180e/pydantic_core-2.46.2-cp314-cp314-win32.whl", hash = "sha256:1b0ab6d756ca2704a938e6c31b53f290c2f9c10d3914235410302a149de1a83e", size = 1953701, upload-time = "2026-04-17T09:13:30.021Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/6a/f2726a780365f7dfd89d62036f984f7acb99978c60c5e1fa7c0cb898ed11/pydantic_core-2.46.2-cp314-cp314-win_amd64.whl", hash = "sha256:99ebade8c9ada4df975372d8dd25883daa0e379a05f1cd0c99aa0c04368d01a6", size = 2071867, upload-time = "2026-04-17T09:10:39.205Z" },
-    { url = "https://files.pythonhosted.org/packages/e1/79/76baacb9feba3d7c399b245ca1a29c74ea0db04ea693811374827eec2290/pydantic_core-2.46.2-cp314-cp314-win_arm64.whl", hash = "sha256:de87422197cf7f83db91d89c86a21660d749b3cd76cd8a45d115b8e675670f02", size = 2017252, upload-time = "2026-04-17T09:10:26.175Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/3b/77c26938f817668d9ad9bab1a905cb23f11d9a3d4bf724d429b3e55a8eaf/pydantic_core-2.46.2-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:236f22b4a206b5b61db955396b7cf9e2e1ff77f372efe9570128ccfcd6a525eb", size = 2094545, upload-time = "2026-04-17T09:12:19.339Z" },
-    { url = "https://files.pythonhosted.org/packages/fe/de/42c13f590e3c260966aa49bcdb1674774f975467c49abd51191e502bea28/pydantic_core-2.46.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:c2012f64d2cd7cca50f49f22445aa5a88691ac2b4498ee0a9a977f8ca4f7289f", size = 1933953, upload-time = "2026-04-17T09:09:55.889Z" },
-    { url = "https://files.pythonhosted.org/packages/4e/84/ebe3ebb3e2d8db656937cfa6f97f544cb7132f2307a4a7dfdcd0ea102a12/pydantic_core-2.46.2-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d07d6c63106d3a9c9a333e2636f9c82c703b1a9e3b079299e58747964e4fdb72", size = 1974435, upload-time = "2026-04-17T09:10:12.371Z" },
-    { url = "https://files.pythonhosted.org/packages/b9/15/0bf51ca6709477cd4ef86148b6d7844f3308f029eac361dd0383f1e17b1a/pydantic_core-2.46.2-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c326a2b4b85e959d9a1fc3a11f32f84611b6ec07c053e1828a860edf8d068208", size = 2031113, upload-time = "2026-04-17T09:10:00.752Z" },
-    { url = "https://files.pythonhosted.org/packages/02/ae/b7b5af9b79db036d9e61a44c481c17a213dc8fc4b8b71fe6875a72fc778b/pydantic_core-2.46.2-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ac8a65e798f2462552c00d2e013d532c94d646729dda98458beaf51f9ec7b120", size = 2236325, upload-time = "2026-04-17T09:10:33.227Z" },
-    { url = "https://files.pythonhosted.org/packages/a6/ae/ecef7477b5a03d4a499708f7e75d2836452ebb70b776c2d64612b334f57a/pydantic_core-2.46.2-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a3c2bc1cc8164bedbc160b7bb1e8cc1e8b9c27f69ae4f9ae2b976cdae02b2dd", size = 2278135, upload-time = "2026-04-17T09:10:23.287Z" },
-    { url = "https://files.pythonhosted.org/packages/db/e4/2f9d82faa47af6c39fc3f120145fd915971e1e0cb6b55b494fad9fdf8275/pydantic_core-2.46.2-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e69aa5e10b7e8b1bb4a6888650fd12fcbf11d396ca11d4a44de1450875702830", size = 2109071, upload-time = "2026-04-17T09:11:06.149Z" },
-    { url = "https://files.pythonhosted.org/packages/f1/9c/677cf10873fbd0b116575ab7b97c90482b21564f8a8040beb18edef7a577/pydantic_core-2.46.2-cp314-cp314t-manylinux_2_31_riscv64.whl", hash = "sha256:4e6df5c3301e65fb42bc5338bf9a1027a02b0a31dc7f54c33775229af474daf0", size = 2106028, upload-time = "2026-04-17T09:10:51.525Z" },
-    { url = "https://files.pythonhosted.org/packages/d6/53/6a06183544daba51c059123a2064a99039df25f115a06bdb26f2ea177038/pydantic_core-2.46.2-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2c2f6e32548ac8d559b47944effcf8ae4d81c161f6b6c885edc53bc08b8f192d", size = 2164816, upload-time = "2026-04-17T09:11:56.187Z" },
-    { url = "https://files.pythonhosted.org/packages/57/6f/10fcdd9e3eca66fc828eef0f6f5850f2dd3bca2c59e6e041fb8bc3da39be/pydantic_core-2.46.2-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:b089a81c58e6ea0485562bbbbbca4f65c0549521606d5ef27fba217aac9b665a", size = 2166130, upload-time = "2026-04-17T09:10:03.804Z" },
-    { url = "https://files.pythonhosted.org/packages/29/83/92d3fd0e0156cad2e3cb5c26de73794af78ac9fa0c22ab666e566dd67061/pydantic_core-2.46.2-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:7f700a6d6f64112ae9193709b84303bbab84424ad4b47d0253301aabce9dfc70", size = 2316605, upload-time = "2026-04-17T09:12:45.249Z" },
-    { url = "https://files.pythonhosted.org/packages/97/f1/facffdb970981068219582e499b8d0871ed163ffcc6b347de5c412669e4c/pydantic_core-2.46.2-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:67db6814beaa5fefe91101ec7eb9efda613795767be96f7cf58b1ca8c9ca9972", size = 2358385, upload-time = "2026-04-17T09:09:54.657Z" },
-    { url = "https://files.pythonhosted.org/packages/8b/a1/b8160b2f22b2199467bc68581a4ed380643c16b348a27d6165c6c242d694/pydantic_core-2.46.2-cp314-cp314t-win32.whl", hash = "sha256:32fbc7447be8e3be99bf7869f7066308f16be55b61f9882c2cefc7931f5c7664", size = 1942373, upload-time = "2026-04-17T09:12:59.594Z" },
-    { url = "https://files.pythonhosted.org/packages/0d/90/db89acabe5b150e11d1b59fe3d947dda2ef6abbfef5c82f056ff63802f5d/pydantic_core-2.46.2-cp314-cp314t-win_amd64.whl", hash = "sha256:b317a2b97019c0b95ce99f4f901ae383f40132da6706cdf1731066a73394c25c", size = 2052078, upload-time = "2026-04-17T09:10:19.96Z" },
-    { url = "https://files.pythonhosted.org/packages/97/32/e19b83ceb07a3f1bb21798407790bbc9a31740158fd132b94139cb84e16c/pydantic_core-2.46.2-cp314-cp314t-win_arm64.whl", hash = "sha256:7dcb9d40930dfad7ab6b20bcc6ca9d2b030b0f347a0cd9909b54bd53ead521b1", size = 2016941, upload-time = "2026-04-17T09:12:34.447Z" },
-    { url = "https://files.pythonhosted.org/packages/f3/d2/66c146f421178641bda880b0267c0d57dd84f5fec9ecc8e46be17b480742/pydantic_core-2.46.2-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:e9fcabd1857492b5bf16f90258babde50f618f55d046b1309972da2396321ff9", size = 2091621, upload-time = "2026-04-17T09:12:47.501Z" },
-    { url = "https://files.pythonhosted.org/packages/ee/b2/c28419aa9fc8055f4ac8e801d1d11c6357351bfa4321ed9bafab3eb98087/pydantic_core-2.46.2-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:fb3ec2c7f54c07b30d89983ce78dc32c37dd06a972448b8716d609493802d628", size = 1937059, upload-time = "2026-04-17T09:10:53.554Z" },
-    { url = "https://files.pythonhosted.org/packages/30/ce/cd0824a2db213dc17113291b7a09b9b0ccd9fbf97daa4b81548703341baf/pydantic_core-2.46.2-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:130a6c837d819ef33e8c2bf702ed2c3429237ea69807f1140943d6f4bdaf52fa", size = 1997278, upload-time = "2026-04-17T09:12:23.784Z" },
-    { url = "https://files.pythonhosted.org/packages/c9/69/47283fe3c0c967d3e9e9cd6c42b70907610c8a6f8d6e8381f1bb55f8006c/pydantic_core-2.46.2-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c2e25417cec5cd9bddb151e33cb08c50160f317479ecc02b22a95ec18f8fe004", size = 2147096, upload-time = "2026-04-17T09:12:43.124Z" },
+sdist = { url = "https://files.pythonhosted.org/packages/71/70/23b021c950c2addd24ec408e9ab05d59b035b39d97cdc1130e1bce647bb6/pydantic_core-2.41.5.tar.gz", hash = "sha256:08daa51ea16ad373ffd5e7606252cc32f07bc72b28284b6bc9c6df804816476e", size = 460952, upload-time = "2025-11-04T13:43:49.098Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c6/90/32c9941e728d564b411d574d8ee0cf09b12ec978cb22b294995bae5549a5/pydantic_core-2.41.5-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:77b63866ca88d804225eaa4af3e664c5faf3568cea95360d21f4725ab6e07146", size = 2107298, upload-time = "2025-11-04T13:39:04.116Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/a8/61c96a77fe28993d9a6fb0f4127e05430a267b235a124545d79fea46dd65/pydantic_core-2.41.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dfa8a0c812ac681395907e71e1274819dec685fec28273a28905df579ef137e2", size = 1901475, upload-time = "2025-11-04T13:39:06.055Z" },
+    { url = "https://files.pythonhosted.org/packages/5d/b6/338abf60225acc18cdc08b4faef592d0310923d19a87fba1faf05af5346e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5921a4d3ca3aee735d9fd163808f5e8dd6c6972101e4adbda9a4667908849b97", size = 1918815, upload-time = "2025-11-04T13:39:10.41Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/1c/2ed0433e682983d8e8cba9c8d8ef274d4791ec6a6f24c58935b90e780e0a/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e25c479382d26a2a41b7ebea1043564a937db462816ea07afa8a44c0866d52f9", size = 2065567, upload-time = "2025-11-04T13:39:12.244Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/24/cf84974ee7d6eae06b9e63289b7b8f6549d416b5c199ca2d7ce13bbcf619/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f547144f2966e1e16ae626d8ce72b4cfa0caedc7fa28052001c94fb2fcaa1c52", size = 2230442, upload-time = "2025-11-04T13:39:13.962Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/21/4e287865504b3edc0136c89c9c09431be326168b1eb7841911cbc877a995/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f52298fbd394f9ed112d56f3d11aabd0d5bd27beb3084cc3d8ad069483b8941", size = 2350956, upload-time = "2025-11-04T13:39:15.889Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/76/7727ef2ffa4b62fcab916686a68a0426b9b790139720e1934e8ba797e238/pydantic_core-2.41.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:100baa204bb412b74fe285fb0f3a385256dad1d1879f0a5cb1499ed2e83d132a", size = 2068253, upload-time = "2025-11-04T13:39:17.403Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/8c/a4abfc79604bcb4c748e18975c44f94f756f08fb04218d5cb87eb0d3a63e/pydantic_core-2.41.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:05a2c8852530ad2812cb7914dc61a1125dc4e06252ee98e5638a12da6cc6fb6c", size = 2177050, upload-time = "2025-11-04T13:39:19.351Z" },
+    { url = "https://files.pythonhosted.org/packages/67/b1/de2e9a9a79b480f9cb0b6e8b6ba4c50b18d4e89852426364c66aa82bb7b3/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:29452c56df2ed968d18d7e21f4ab0ac55e71dc59524872f6fc57dcf4a3249ed2", size = 2147178, upload-time = "2025-11-04T13:39:21Z" },
+    { url = "https://files.pythonhosted.org/packages/16/c1/dfb33f837a47b20417500efaa0378adc6635b3c79e8369ff7a03c494b4ac/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_armv7l.whl", hash = "sha256:d5160812ea7a8a2ffbe233d8da666880cad0cbaf5d4de74ae15c313213d62556", size = 2341833, upload-time = "2025-11-04T13:39:22.606Z" },
+    { url = "https://files.pythonhosted.org/packages/47/36/00f398642a0f4b815a9a558c4f1dca1b4020a7d49562807d7bc9ff279a6c/pydantic_core-2.41.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:df3959765b553b9440adfd3c795617c352154e497a4eaf3752555cfb5da8fc49", size = 2321156, upload-time = "2025-11-04T13:39:25.843Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/70/cad3acd89fde2010807354d978725ae111ddf6d0ea46d1ea1775b5c1bd0c/pydantic_core-2.41.5-cp310-cp310-win32.whl", hash = "sha256:1f8d33a7f4d5a7889e60dc39856d76d09333d8a6ed0f5f1190635cbec70ec4ba", size = 1989378, upload-time = "2025-11-04T13:39:27.92Z" },
+    { url = "https://files.pythonhosted.org/packages/76/92/d338652464c6c367e5608e4488201702cd1cbb0f33f7b6a85a60fe5f3720/pydantic_core-2.41.5-cp310-cp310-win_amd64.whl", hash = "sha256:62de39db01b8d593e45871af2af9e497295db8d73b085f6bfd0b18c83c70a8f9", size = 2013622, upload-time = "2025-11-04T13:39:29.848Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/72/74a989dd9f2084b3d9530b0915fdda64ac48831c30dbf7c72a41a5232db8/pydantic_core-2.41.5-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a3a52f6156e73e7ccb0f8cced536adccb7042be67cb45f9562e12b319c119da6", size = 2105873, upload-time = "2025-11-04T13:39:31.373Z" },
+    { url = "https://files.pythonhosted.org/packages/12/44/37e403fd9455708b3b942949e1d7febc02167662bf1a7da5b78ee1ea2842/pydantic_core-2.41.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7f3bf998340c6d4b0c9a2f02d6a400e51f123b59565d74dc60d252ce888c260b", size = 1899826, upload-time = "2025-11-04T13:39:32.897Z" },
+    { url = "https://files.pythonhosted.org/packages/33/7f/1d5cab3ccf44c1935a359d51a8a2a9e1a654b744b5e7f80d41b88d501eec/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:378bec5c66998815d224c9ca994f1e14c0c21cb95d2f52b6021cc0b2a58f2a5a", size = 1917869, upload-time = "2025-11-04T13:39:34.469Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/6a/30d94a9674a7fe4f4744052ed6c5e083424510be1e93da5bc47569d11810/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e7b576130c69225432866fe2f4a469a85a54ade141d96fd396dffcf607b558f8", size = 2063890, upload-time = "2025-11-04T13:39:36.053Z" },
+    { url = "https://files.pythonhosted.org/packages/50/be/76e5d46203fcb2750e542f32e6c371ffa9b8ad17364cf94bb0818dbfb50c/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6cb58b9c66f7e4179a2d5e0f849c48eff5c1fca560994d6eb6543abf955a149e", size = 2229740, upload-time = "2025-11-04T13:39:37.753Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/ee/fed784df0144793489f87db310a6bbf8118d7b630ed07aa180d6067e653a/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:88942d3a3dff3afc8288c21e565e476fc278902ae4d6d134f1eeda118cc830b1", size = 2350021, upload-time = "2025-11-04T13:39:40.94Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/be/8fed28dd0a180dca19e72c233cbf58efa36df055e5b9d90d64fd1740b828/pydantic_core-2.41.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f31d95a179f8d64d90f6831d71fa93290893a33148d890ba15de25642c5d075b", size = 2066378, upload-time = "2025-11-04T13:39:42.523Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/3b/698cf8ae1d536a010e05121b4958b1257f0b5522085e335360e53a6b1c8b/pydantic_core-2.41.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:c1df3d34aced70add6f867a8cf413e299177e0c22660cc767218373d0779487b", size = 2175761, upload-time = "2025-11-04T13:39:44.553Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/ba/15d537423939553116dea94ce02f9c31be0fa9d0b806d427e0308ec17145/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:4009935984bd36bd2c774e13f9a09563ce8de4abaa7226f5108262fa3e637284", size = 2146303, upload-time = "2025-11-04T13:39:46.238Z" },
+    { url = "https://files.pythonhosted.org/packages/58/7f/0de669bf37d206723795f9c90c82966726a2ab06c336deba4735b55af431/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:34a64bc3441dc1213096a20fe27e8e128bd3ff89921706e83c0b1ac971276594", size = 2340355, upload-time = "2025-11-04T13:39:48.002Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/de/e7482c435b83d7e3c3ee5ee4451f6e8973cff0eb6007d2872ce6383f6398/pydantic_core-2.41.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c9e19dd6e28fdcaa5a1de679aec4141f691023916427ef9bae8584f9c2fb3b0e", size = 2319875, upload-time = "2025-11-04T13:39:49.705Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/e6/8c9e81bb6dd7560e33b9053351c29f30c8194b72f2d6932888581f503482/pydantic_core-2.41.5-cp311-cp311-win32.whl", hash = "sha256:2c010c6ded393148374c0f6f0bf89d206bf3217f201faa0635dcd56bd1520f6b", size = 1987549, upload-time = "2025-11-04T13:39:51.842Z" },
+    { url = "https://files.pythonhosted.org/packages/11/66/f14d1d978ea94d1bc21fc98fcf570f9542fe55bfcc40269d4e1a21c19bf7/pydantic_core-2.41.5-cp311-cp311-win_amd64.whl", hash = "sha256:76ee27c6e9c7f16f47db7a94157112a2f3a00e958bc626e2f4ee8bec5c328fbe", size = 2011305, upload-time = "2025-11-04T13:39:53.485Z" },
+    { url = "https://files.pythonhosted.org/packages/56/d8/0e271434e8efd03186c5386671328154ee349ff0354d83c74f5caaf096ed/pydantic_core-2.41.5-cp311-cp311-win_arm64.whl", hash = "sha256:4bc36bbc0b7584de96561184ad7f012478987882ebf9f9c389b23f432ea3d90f", size = 1972902, upload-time = "2025-11-04T13:39:56.488Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/5d/5f6c63eebb5afee93bcaae4ce9a898f3373ca23df3ccaef086d0233a35a7/pydantic_core-2.41.5-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f41a7489d32336dbf2199c8c0a215390a751c5b014c2c1c5366e817202e9cdf7", size = 2110990, upload-time = "2025-11-04T13:39:58.079Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/32/9c2e8ccb57c01111e0fd091f236c7b371c1bccea0fa85247ac55b1e2b6b6/pydantic_core-2.41.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:070259a8818988b9a84a449a2a7337c7f430a22acc0859c6b110aa7212a6d9c0", size = 1896003, upload-time = "2025-11-04T13:39:59.956Z" },
+    { url = "https://files.pythonhosted.org/packages/68/b8/a01b53cb0e59139fbc9e4fda3e9724ede8de279097179be4ff31f1abb65a/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e96cea19e34778f8d59fe40775a7a574d95816eb150850a85a7a4c8f4b94ac69", size = 1919200, upload-time = "2025-11-04T13:40:02.241Z" },
+    { url = "https://files.pythonhosted.org/packages/38/de/8c36b5198a29bdaade07b5985e80a233a5ac27137846f3bc2d3b40a47360/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ed2e99c456e3fadd05c991f8f437ef902e00eedf34320ba2b0842bd1c3ca3a75", size = 2052578, upload-time = "2025-11-04T13:40:04.401Z" },
+    { url = "https://files.pythonhosted.org/packages/00/b5/0e8e4b5b081eac6cb3dbb7e60a65907549a1ce035a724368c330112adfdd/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:65840751b72fbfd82c3c640cff9284545342a4f1eb1586ad0636955b261b0b05", size = 2208504, upload-time = "2025-11-04T13:40:06.072Z" },
+    { url = "https://files.pythonhosted.org/packages/77/56/87a61aad59c7c5b9dc8caad5a41a5545cba3810c3e828708b3d7404f6cef/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e536c98a7626a98feb2d3eaf75944ef6f3dbee447e1f841eae16f2f0a72d8ddc", size = 2335816, upload-time = "2025-11-04T13:40:07.835Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/76/941cc9f73529988688a665a5c0ecff1112b3d95ab48f81db5f7606f522d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eceb81a8d74f9267ef4081e246ffd6d129da5d87e37a77c9bde550cb04870c1c", size = 2075366, upload-time = "2025-11-04T13:40:09.804Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/43/ebef01f69baa07a482844faaa0a591bad1ef129253ffd0cdaa9d8a7f72d3/pydantic_core-2.41.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d38548150c39b74aeeb0ce8ee1d8e82696f4a4e16ddc6de7b1d8823f7de4b9b5", size = 2171698, upload-time = "2025-11-04T13:40:12.004Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/87/41f3202e4193e3bacfc2c065fab7706ebe81af46a83d3e27605029c1f5a6/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c23e27686783f60290e36827f9c626e63154b82b116d7fe9adba1fda36da706c", size = 2132603, upload-time = "2025-11-04T13:40:13.868Z" },
+    { url = "https://files.pythonhosted.org/packages/49/7d/4c00df99cb12070b6bccdef4a195255e6020a550d572768d92cc54dba91a/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:482c982f814460eabe1d3bb0adfdc583387bd4691ef00b90575ca0d2b6fe2294", size = 2329591, upload-time = "2025-11-04T13:40:15.672Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/6a/ebf4b1d65d458f3cda6a7335d141305dfa19bdc61140a884d165a8a1bbc7/pydantic_core-2.41.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:bfea2a5f0b4d8d43adf9d7b8bf019fb46fdd10a2e5cde477fbcb9d1fa08c68e1", size = 2319068, upload-time = "2025-11-04T13:40:17.532Z" },
+    { url = "https://files.pythonhosted.org/packages/49/3b/774f2b5cd4192d5ab75870ce4381fd89cf218af999515baf07e7206753f0/pydantic_core-2.41.5-cp312-cp312-win32.whl", hash = "sha256:b74557b16e390ec12dca509bce9264c3bbd128f8a2c376eaa68003d7f327276d", size = 1985908, upload-time = "2025-11-04T13:40:19.309Z" },
+    { url = "https://files.pythonhosted.org/packages/86/45/00173a033c801cacf67c190fef088789394feaf88a98a7035b0e40d53dc9/pydantic_core-2.41.5-cp312-cp312-win_amd64.whl", hash = "sha256:1962293292865bca8e54702b08a4f26da73adc83dd1fcf26fbc875b35d81c815", size = 2020145, upload-time = "2025-11-04T13:40:21.548Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/22/91fbc821fa6d261b376a3f73809f907cec5ca6025642c463d3488aad22fb/pydantic_core-2.41.5-cp312-cp312-win_arm64.whl", hash = "sha256:1746d4a3d9a794cacae06a5eaaccb4b8643a131d45fbc9af23e353dc0a5ba5c3", size = 1976179, upload-time = "2025-11-04T13:40:23.393Z" },
+    { url = "https://files.pythonhosted.org/packages/87/06/8806241ff1f70d9939f9af039c6c35f2360cf16e93c2ca76f184e76b1564/pydantic_core-2.41.5-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:941103c9be18ac8daf7b7adca8228f8ed6bb7a1849020f643b3a14d15b1924d9", size = 2120403, upload-time = "2025-11-04T13:40:25.248Z" },
+    { url = "https://files.pythonhosted.org/packages/94/02/abfa0e0bda67faa65fef1c84971c7e45928e108fe24333c81f3bfe35d5f5/pydantic_core-2.41.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:112e305c3314f40c93998e567879e887a3160bb8689ef3d2c04b6cc62c33ac34", size = 1896206, upload-time = "2025-11-04T13:40:27.099Z" },
+    { url = "https://files.pythonhosted.org/packages/15/df/a4c740c0943e93e6500f9eb23f4ca7ec9bf71b19e608ae5b579678c8d02f/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cbaad15cb0c90aa221d43c00e77bb33c93e8d36e0bf74760cd00e732d10a6a0", size = 1919307, upload-time = "2025-11-04T13:40:29.806Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/e3/6324802931ae1d123528988e0e86587c2072ac2e5394b4bc2bc34b61ff6e/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:03ca43e12fab6023fc79d28ca6b39b05f794ad08ec2feccc59a339b02f2b3d33", size = 2063258, upload-time = "2025-11-04T13:40:33.544Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/d4/2230d7151d4957dd79c3044ea26346c148c98fbf0ee6ebd41056f2d62ab5/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:dc799088c08fa04e43144b164feb0c13f9a0bc40503f8df3e9fde58a3c0c101e", size = 2214917, upload-time = "2025-11-04T13:40:35.479Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/9f/eaac5df17a3672fef0081b6c1bb0b82b33ee89aa5cec0d7b05f52fd4a1fa/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97aeba56665b4c3235a0e52b2c2f5ae9cd071b8a8310ad27bddb3f7fb30e9aa2", size = 2332186, upload-time = "2025-11-04T13:40:37.436Z" },
+    { url = "https://files.pythonhosted.org/packages/cf/4e/35a80cae583a37cf15604b44240e45c05e04e86f9cfd766623149297e971/pydantic_core-2.41.5-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:406bf18d345822d6c21366031003612b9c77b3e29ffdb0f612367352aab7d586", size = 2073164, upload-time = "2025-11-04T13:40:40.289Z" },
+    { url = "https://files.pythonhosted.org/packages/bf/e3/f6e262673c6140dd3305d144d032f7bd5f7497d3871c1428521f19f9efa2/pydantic_core-2.41.5-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b93590ae81f7010dbe380cdeab6f515902ebcbefe0b9327cc4804d74e93ae69d", size = 2179146, upload-time = "2025-11-04T13:40:42.809Z" },
+    { url = "https://files.pythonhosted.org/packages/75/c7/20bd7fc05f0c6ea2056a4565c6f36f8968c0924f19b7d97bbfea55780e73/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:01a3d0ab748ee531f4ea6c3e48ad9dac84ddba4b0d82291f87248f2f9de8d740", size = 2137788, upload-time = "2025-11-04T13:40:44.752Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/8d/34318ef985c45196e004bc46c6eab2eda437e744c124ef0dbe1ff2c9d06b/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:6561e94ba9dacc9c61bce40e2d6bdc3bfaa0259d3ff36ace3b1e6901936d2e3e", size = 2340133, upload-time = "2025-11-04T13:40:46.66Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/59/013626bf8c78a5a5d9350d12e7697d3d4de951a75565496abd40ccd46bee/pydantic_core-2.41.5-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:915c3d10f81bec3a74fbd4faebe8391013ba61e5a1a8d48c4455b923bdda7858", size = 2324852, upload-time = "2025-11-04T13:40:48.575Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/d9/c248c103856f807ef70c18a4f986693a46a8ffe1602e5d361485da502d20/pydantic_core-2.41.5-cp313-cp313-win32.whl", hash = "sha256:650ae77860b45cfa6e2cdafc42618ceafab3a2d9a3811fcfbd3bbf8ac3c40d36", size = 1994679, upload-time = "2025-11-04T13:40:50.619Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/8b/341991b158ddab181cff136acd2552c9f35bd30380422a639c0671e99a91/pydantic_core-2.41.5-cp313-cp313-win_amd64.whl", hash = "sha256:79ec52ec461e99e13791ec6508c722742ad745571f234ea6255bed38c6480f11", size = 2019766, upload-time = "2025-11-04T13:40:52.631Z" },
+    { url = "https://files.pythonhosted.org/packages/73/7d/f2f9db34af103bea3e09735bb40b021788a5e834c81eedb541991badf8f5/pydantic_core-2.41.5-cp313-cp313-win_arm64.whl", hash = "sha256:3f84d5c1b4ab906093bdc1ff10484838aca54ef08de4afa9de0f5f14d69639cd", size = 1981005, upload-time = "2025-11-04T13:40:54.734Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/28/46b7c5c9635ae96ea0fbb779e271a38129df2550f763937659ee6c5dbc65/pydantic_core-2.41.5-cp314-cp314-macosx_10_12_x86_64.whl", hash = "sha256:3f37a19d7ebcdd20b96485056ba9e8b304e27d9904d233d7b1015db320e51f0a", size = 2119622, upload-time = "2025-11-04T13:40:56.68Z" },
+    { url = "https://files.pythonhosted.org/packages/74/1a/145646e5687e8d9a1e8d09acb278c8535ebe9e972e1f162ed338a622f193/pydantic_core-2.41.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:1d1d9764366c73f996edd17abb6d9d7649a7eb690006ab6adbda117717099b14", size = 1891725, upload-time = "2025-11-04T13:40:58.807Z" },
+    { url = "https://files.pythonhosted.org/packages/23/04/e89c29e267b8060b40dca97bfc64a19b2a3cf99018167ea1677d96368273/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25e1c2af0fce638d5f1988b686f3b3ea8cd7de5f244ca147c777769e798a9cd1", size = 1915040, upload-time = "2025-11-04T13:41:00.853Z" },
+    { url = "https://files.pythonhosted.org/packages/84/a3/15a82ac7bd97992a82257f777b3583d3e84bdb06ba6858f745daa2ec8a85/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:506d766a8727beef16b7adaeb8ee6217c64fc813646b424d0804d67c16eddb66", size = 2063691, upload-time = "2025-11-04T13:41:03.504Z" },
+    { url = "https://files.pythonhosted.org/packages/74/9b/0046701313c6ef08c0c1cf0e028c67c770a4e1275ca73131563c5f2a310a/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4819fa52133c9aa3c387b3328f25c1facc356491e6135b459f1de698ff64d869", size = 2213897, upload-time = "2025-11-04T13:41:05.804Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/cd/6bac76ecd1b27e75a95ca3a9a559c643b3afcd2dd62086d4b7a32a18b169/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2b761d210c9ea91feda40d25b4efe82a1707da2ef62901466a42492c028553a2", size = 2333302, upload-time = "2025-11-04T13:41:07.809Z" },
+    { url = "https://files.pythonhosted.org/packages/4c/d2/ef2074dc020dd6e109611a8be4449b98cd25e1b9b8a303c2f0fca2f2bcf7/pydantic_core-2.41.5-cp314-cp314-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:22f0fb8c1c583a3b6f24df2470833b40207e907b90c928cc8d3594b76f874375", size = 2064877, upload-time = "2025-11-04T13:41:09.827Z" },
+    { url = "https://files.pythonhosted.org/packages/18/66/e9db17a9a763d72f03de903883c057b2592c09509ccfe468187f2a2eef29/pydantic_core-2.41.5-cp314-cp314-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2782c870e99878c634505236d81e5443092fba820f0373997ff75f90f68cd553", size = 2180680, upload-time = "2025-11-04T13:41:12.379Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/9e/3ce66cebb929f3ced22be85d4c2399b8e85b622db77dad36b73c5387f8f8/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_aarch64.whl", hash = "sha256:0177272f88ab8312479336e1d777f6b124537d47f2123f89cb37e0accea97f90", size = 2138960, upload-time = "2025-11-04T13:41:14.627Z" },
+    { url = "https://files.pythonhosted.org/packages/a6/62/205a998f4327d2079326b01abee48e502ea739d174f0a89295c481a2272e/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_armv7l.whl", hash = "sha256:63510af5e38f8955b8ee5687740d6ebf7c2a0886d15a6d65c32814613681bc07", size = 2339102, upload-time = "2025-11-04T13:41:16.868Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/0d/f05e79471e889d74d3d88f5bd20d0ed189ad94c2423d81ff8d0000aab4ff/pydantic_core-2.41.5-cp314-cp314-musllinux_1_1_x86_64.whl", hash = "sha256:e56ba91f47764cc14f1daacd723e3e82d1a89d783f0f5afe9c364b8bb491ccdb", size = 2326039, upload-time = "2025-11-04T13:41:18.934Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/e1/e08a6208bb100da7e0c4b288eed624a703f4d129bde2da475721a80cab32/pydantic_core-2.41.5-cp314-cp314-win32.whl", hash = "sha256:aec5cf2fd867b4ff45b9959f8b20ea3993fc93e63c7363fe6851424c8a7e7c23", size = 1995126, upload-time = "2025-11-04T13:41:21.418Z" },
+    { url = "https://files.pythonhosted.org/packages/48/5d/56ba7b24e9557f99c9237e29f5c09913c81eeb2f3217e40e922353668092/pydantic_core-2.41.5-cp314-cp314-win_amd64.whl", hash = "sha256:8e7c86f27c585ef37c35e56a96363ab8de4e549a95512445b85c96d3e2f7c1bf", size = 2015489, upload-time = "2025-11-04T13:41:24.076Z" },
+    { url = "https://files.pythonhosted.org/packages/4e/bb/f7a190991ec9e3e0ba22e4993d8755bbc4a32925c0b5b42775c03e8148f9/pydantic_core-2.41.5-cp314-cp314-win_arm64.whl", hash = "sha256:e672ba74fbc2dc8eea59fb6d4aed6845e6905fc2a8afe93175d94a83ba2a01a0", size = 1977288, upload-time = "2025-11-04T13:41:26.33Z" },
+    { url = "https://files.pythonhosted.org/packages/92/ed/77542d0c51538e32e15afe7899d79efce4b81eee631d99850edc2f5e9349/pydantic_core-2.41.5-cp314-cp314t-macosx_10_12_x86_64.whl", hash = "sha256:8566def80554c3faa0e65ac30ab0932b9e3a5cd7f8323764303d468e5c37595a", size = 2120255, upload-time = "2025-11-04T13:41:28.569Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/3d/6913dde84d5be21e284439676168b28d8bbba5600d838b9dca99de0fad71/pydantic_core-2.41.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:b80aa5095cd3109962a298ce14110ae16b8c1aece8b72f9dafe81cf597ad80b3", size = 1863760, upload-time = "2025-11-04T13:41:31.055Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/f0/e5e6b99d4191da102f2b0eb9687aaa7f5bea5d9964071a84effc3e40f997/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3006c3dd9ba34b0c094c544c6006cc79e87d8612999f1a5d43b769b89181f23c", size = 1878092, upload-time = "2025-11-04T13:41:33.21Z" },
+    { url = "https://files.pythonhosted.org/packages/71/48/36fb760642d568925953bcc8116455513d6e34c4beaa37544118c36aba6d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:72f6c8b11857a856bcfa48c86f5368439f74453563f951e473514579d44aa612", size = 2053385, upload-time = "2025-11-04T13:41:35.508Z" },
+    { url = "https://files.pythonhosted.org/packages/20/25/92dc684dd8eb75a234bc1c764b4210cf2646479d54b47bf46061657292a8/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5cb1b2f9742240e4bb26b652a5aeb840aa4b417c7748b6f8387927bc6e45e40d", size = 2218832, upload-time = "2025-11-04T13:41:37.732Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/09/f53e0b05023d3e30357d82eb35835d0f6340ca344720a4599cd663dca599/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bd3d54f38609ff308209bd43acea66061494157703364ae40c951f83ba99a1a9", size = 2327585, upload-time = "2025-11-04T13:41:40Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/4e/2ae1aa85d6af35a39b236b1b1641de73f5a6ac4d5a7509f77b814885760c/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ff4321e56e879ee8d2a879501c8e469414d948f4aba74a2d4593184eb326660", size = 2041078, upload-time = "2025-11-04T13:41:42.323Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/13/2e215f17f0ef326fc72afe94776edb77525142c693767fc347ed6288728d/pydantic_core-2.41.5-cp314-cp314t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d0d2568a8c11bf8225044aa94409e21da0cb09dcdafe9ecd10250b2baad531a9", size = 2173914, upload-time = "2025-11-04T13:41:45.221Z" },
+    { url = "https://files.pythonhosted.org/packages/02/7a/f999a6dcbcd0e5660bc348a3991c8915ce6599f4f2c6ac22f01d7a10816c/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:a39455728aabd58ceabb03c90e12f71fd30fa69615760a075b9fec596456ccc3", size = 2129560, upload-time = "2025-11-04T13:41:47.474Z" },
+    { url = "https://files.pythonhosted.org/packages/3a/b1/6c990ac65e3b4c079a4fb9f5b05f5b013afa0f4ed6780a3dd236d2cbdc64/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_armv7l.whl", hash = "sha256:239edca560d05757817c13dc17c50766136d21f7cd0fac50295499ae24f90fdf", size = 2329244, upload-time = "2025-11-04T13:41:49.992Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/02/3c562f3a51afd4d88fff8dffb1771b30cfdfd79befd9883ee094f5b6c0d8/pydantic_core-2.41.5-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:2a5e06546e19f24c6a96a129142a75cee553cc018ffee48a460059b1185f4470", size = 2331955, upload-time = "2025-11-04T13:41:54.079Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/96/5fb7d8c3c17bc8c62fdb031c47d77a1af698f1d7a406b0f79aaa1338f9ad/pydantic_core-2.41.5-cp314-cp314t-win32.whl", hash = "sha256:b4ececa40ac28afa90871c2cc2b9ffd2ff0bf749380fbdf57d165fd23da353aa", size = 1988906, upload-time = "2025-11-04T13:41:56.606Z" },
+    { url = "https://files.pythonhosted.org/packages/22/ed/182129d83032702912c2e2d8bbe33c036f342cc735737064668585dac28f/pydantic_core-2.41.5-cp314-cp314t-win_amd64.whl", hash = "sha256:80aa89cad80b32a912a65332f64a4450ed00966111b6615ca6816153d3585a8c", size = 1981607, upload-time = "2025-11-04T13:41:58.889Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/ed/068e41660b832bb0b1aa5b58011dea2a3fe0ba7861ff38c4d4904c1c1a99/pydantic_core-2.41.5-cp314-cp314t-win_arm64.whl", hash = "sha256:35b44f37a3199f771c3eaa53051bc8a70cd7b54f333531c59e29fd4db5d15008", size = 1974769, upload-time = "2025-11-04T13:42:01.186Z" },
+    { url = "https://files.pythonhosted.org/packages/11/72/90fda5ee3b97e51c494938a4a44c3a35a9c96c19bba12372fb9c634d6f57/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_10_12_x86_64.whl", hash = "sha256:b96d5f26b05d03cc60f11a7761a5ded1741da411e7fe0909e27a5e6a0cb7b034", size = 2115441, upload-time = "2025-11-04T13:42:39.557Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/53/8942f884fa33f50794f119012dc6a1a02ac43a56407adaac20463df8e98f/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-macosx_11_0_arm64.whl", hash = "sha256:634e8609e89ceecea15e2d61bc9ac3718caaaa71963717bf3c8f38bfde64242c", size = 1930291, upload-time = "2025-11-04T13:42:42.169Z" },
+    { url = "https://files.pythonhosted.org/packages/79/c8/ecb9ed9cd942bce09fc888ee960b52654fbdbede4ba6c2d6e0d3b1d8b49c/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93e8740d7503eb008aa2df04d3b9735f845d43ae845e6dcd2be0b55a2da43cd2", size = 1948632, upload-time = "2025-11-04T13:42:44.564Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/1b/687711069de7efa6af934e74f601e2a4307365e8fdc404703afc453eab26/pydantic_core-2.41.5-graalpy311-graalpy242_311_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f15489ba13d61f670dcc96772e733aad1a6f9c429cc27574c6cdaed82d0146ad", size = 2138905, upload-time = "2025-11-04T13:42:47.156Z" },
+    { url = "https://files.pythonhosted.org/packages/09/32/59b0c7e63e277fa7911c2fc70ccfb45ce4b98991e7ef37110663437005af/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_10_12_x86_64.whl", hash = "sha256:7da7087d756b19037bc2c06edc6c170eeef3c3bafcb8f532ff17d64dc427adfd", size = 2110495, upload-time = "2025-11-04T13:42:49.689Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/81/05e400037eaf55ad400bcd318c05bb345b57e708887f07ddb2d20e3f0e98/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-macosx_11_0_arm64.whl", hash = "sha256:aabf5777b5c8ca26f7824cb4a120a740c9588ed58df9b2d196ce92fba42ff8dc", size = 1915388, upload-time = "2025-11-04T13:42:52.215Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/0d/e3549b2399f71d56476b77dbf3cf8937cec5cd70536bdc0e374a421d0599/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c007fe8a43d43b3969e8469004e9845944f1a80e6acd47c150856bb87f230c56", size = 1942879, upload-time = "2025-11-04T13:42:56.483Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/07/34573da085946b6a313d7c42f82f16e8920bfd730665de2d11c0c37a74b5/pydantic_core-2.41.5-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:76d0819de158cd855d1cbb8fcafdf6f5cf1eb8e470abe056d5d161106e38062b", size = 2139017, upload-time = "2025-11-04T13:42:59.471Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/b0/1a2aa41e3b5a4ba11420aba2d091b2d17959c8d1519ece3627c371951e73/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b5819cd790dbf0c5eb9f82c73c16b39a65dd6dd4d1439dcdea7816ec9adddab8", size = 2103351, upload-time = "2025-11-04T13:43:02.058Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/ee/31b1f0020baaf6d091c87900ae05c6aeae101fa4e188e1613c80e4f1ea31/pydantic_core-2.41.5-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:5a4e67afbc95fa5c34cf27d9089bca7fcab4e51e57278d710320a70b956d1b9a", size = 1925363, upload-time = "2025-11-04T13:43:05.159Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/89/ab8e86208467e467a80deaca4e434adac37b10a9d134cd2f99b28a01e483/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ece5c59f0ce7d001e017643d8d24da587ea1f74f6993467d85ae8a5ef9d4f42b", size = 2135615, upload-time = "2025-11-04T13:43:08.116Z" },
+    { url = "https://files.pythonhosted.org/packages/99/0a/99a53d06dd0348b2008f2f30884b34719c323f16c3be4e6cc1203b74a91d/pydantic_core-2.41.5-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:16f80f7abe3351f8ea6858914ddc8c77e02578544a0ebc15b4c2e1a0e813b0b2", size = 2175369, upload-time = "2025-11-04T13:43:12.49Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/94/30ca3b73c6d485b9bb0bc66e611cff4a7138ff9736b7e66bcf0852151636/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:33cb885e759a705b426baada1fe68cbb0a2e68e34c5d0d0289a364cf01709093", size = 2144218, upload-time = "2025-11-04T13:43:15.431Z" },
+    { url = "https://files.pythonhosted.org/packages/87/57/31b4f8e12680b739a91f472b5671294236b82586889ef764b5fbc6669238/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:c8d8b4eb992936023be7dee581270af5c6e0697a8559895f527f5b7105ecd36a", size = 2329951, upload-time = "2025-11-04T13:43:18.062Z" },
+    { url = "https://files.pythonhosted.org/packages/7d/73/3c2c8edef77b8f7310e6fb012dbc4b8551386ed575b9eb6fb2506e28a7eb/pydantic_core-2.41.5-pp310-pypy310_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:242a206cd0318f95cd21bdacff3fcc3aab23e79bba5cac3db5a841c9ef9c6963", size = 2318428, upload-time = "2025-11-04T13:43:20.679Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/02/8559b1f26ee0d502c74f9cca5c0d2fd97e967e083e006bbbb4e97f3a043a/pydantic_core-2.41.5-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:d3a978c4f57a597908b7e697229d996d77a6d3c94901e9edee593adada95ce1a", size = 2147009, upload-time = "2025-11-04T13:43:23.286Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/9b/1b3f0e9f9305839d7e84912f9e8bfbd191ed1b1ef48083609f0dabde978c/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:b2379fa7ed44ddecb5bfe4e48577d752db9fc10be00a6b7446e9663ba143de26", size = 2101980, upload-time = "2025-11-04T13:43:25.97Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/ed/d71fefcb4263df0da6a85b5d8a7508360f2f2e9b3bf5814be9c8bccdccc1/pydantic_core-2.41.5-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:266fb4cbf5e3cbd0b53669a6d1b039c45e3ce651fd5442eff4d07c2cc8d66808", size = 1923865, upload-time = "2025-11-04T13:43:28.763Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/3a/626b38db460d675f873e4444b4bb030453bbe7b4ba55df821d026a0493c4/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58133647260ea01e4d0500089a8c4f07bd7aa6ce109682b1426394988d8aaacc", size = 2134256, upload-time = "2025-11-04T13:43:31.71Z" },
+    { url = "https://files.pythonhosted.org/packages/83/d9/8412d7f06f616bbc053d30cb4e5f76786af3221462ad5eee1f202021eb4e/pydantic_core-2.41.5-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:287dad91cfb551c363dc62899a80e9e14da1f0e2b6ebde82c806612ca2a13ef1", size = 2174762, upload-time = "2025-11-04T13:43:34.744Z" },
+    { url = "https://files.pythonhosted.org/packages/55/4c/162d906b8e3ba3a99354e20faa1b49a85206c47de97a639510a0e673f5da/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_aarch64.whl", hash = "sha256:03b77d184b9eb40240ae9fd676ca364ce1085f203e1b1256f8ab9984dca80a84", size = 2143141, upload-time = "2025-11-04T13:43:37.701Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/f2/f11dd73284122713f5f89fc940f370d035fa8e1e078d446b3313955157fe/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_armv7l.whl", hash = "sha256:a668ce24de96165bb239160b3d854943128f4334822900534f2fe947930e5770", size = 2330317, upload-time = "2025-11-04T13:43:40.406Z" },
+    { url = "https://files.pythonhosted.org/packages/88/9d/b06ca6acfe4abb296110fb1273a4d848a0bfb2ff65f3ee92127b3244e16b/pydantic_core-2.41.5-pp311-pypy311_pp73-musllinux_1_1_x86_64.whl", hash = "sha256:f14f8f046c14563f8eb3f45f499cc658ab8d10072961e07225e507adb700e93f", size = 2316992, upload-time = "2025-11-04T13:43:43.602Z" },
+    { url = "https://files.pythonhosted.org/packages/36/c7/cfc8e811f061c841d7990b0201912c3556bfeb99cdcb7ed24adc8d6f8704/pydantic_core-2.41.5-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:56121965f7a4dc965bff783d70b907ddf3d57f6eba29b6d2e5dabfaf07799c51", size = 2145302, upload-time = "2025-11-04T13:43:46.64Z" },
 ]
 
 [[package]]
@@ -5111,9 +6206,12 @@ dependencies = [
     { name = "accessible-pygments" },
     { name = "babel" },
     { name = "beautifulsoup4" },
-    { name = "docutils" },
+    { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pygments" },
-    { name = "sphinx" },
+    { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/de/bb/4a97aaa840b26601d6d04deca1389c35025336428706a4a732051187fbd3/pydata_sphinx_theme-0.17.0.tar.gz", hash = "sha256:529c5631582cb3328cf4814fb9eb80611d1704c854406d282a75c9c86e3a1955", size = 4990605, upload-time = "2026-04-03T13:02:20.091Z" }
@@ -5132,7 +6230,7 @@ wheels = [
 
 [[package]]
 name = "pygithub"
-version = "2.9.1"
+version = "2.9.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "pyjwt", extra = ["crypto"] },
@@ -5141,9 +6239,9 @@ dependencies = [
     { name = "typing-extensions" },
     { name = "urllib3" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ab/c3/8465a311197e16cf5ab68789fe689535e90f6b61ab524cc32a39e67237ae/pygithub-2.9.1.tar.gz", hash = "sha256:59771d7ff63d54d427be2e7d0dad2208dfffc2b0a045fec959263787739b611c", size = 2594989, upload-time = "2026-04-14T07:26:13.622Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/a6/9a/44f918e9be12e49cb8b053f09d5d0733b74df52bf4dabc570da1c3ecd9f6/pygithub-2.9.0.tar.gz", hash = "sha256:a26abda1222febba31238682634cad11d8b966137ed6cc3c5e445b29a11cb0a4", size = 2592289, upload-time = "2026-03-22T21:14:39.053Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/77/aa/81a5506f089a26338bff17535e4339b3b22049ebd1bcdeff756c4d7a7559/pygithub-2.9.1-py3-none-any.whl", hash = "sha256:2ec78fca30092d51a42d76f4ddb02131b6f0c666a35dfdf364cf302cdda115b9", size = 449710, upload-time = "2026-04-14T07:26:12.382Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/de/72e02bc7674e161b155a4b5a03b2347129d0626115bc97ba5bad5070cac9/pygithub-2.9.0-py3-none-any.whl", hash = "sha256:5e2b260ce327bffce9b00f447b65953ef7078ffe93e5a5425624a3075483927c", size = 449653, upload-time = "2026-03-22T21:14:37.726Z" },
 ]
 
 [[package]]
@@ -5159,6 +6257,9 @@ wheels = [
 name = "pyjwt"
 version = "2.12.1"
 source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
 sdist = { url = "https://files.pythonhosted.org/packages/c2/27/a3b6e5bf6ff856d2509292e95c8f57f0df7017cf5394921fc4e4ef40308a/pyjwt-2.12.1.tar.gz", hash = "sha256:c74a7a2adf861c04d002db713dd85f84beb242228e671280bf709d765b03672b", size = 102564, upload-time = "2026-03-13T19:27:37.25Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/e5/7a/8dd906bd22e79e47397a61742927f6747fe93242ef86645ee9092e610244/pyjwt-2.12.1-py3-none-any.whl", hash = "sha256:28ca37c070cad8ba8cd9790cd940535d40274d22f80ab87f3ac6a713e6e8454c", size = 29726, upload-time = "2026-03-13T19:27:35.677Z" },
@@ -5180,6 +6281,7 @@ dependencies = [
     { name = "isort" },
     { name = "mccabe" },
     { name = "platformdirs" },
+    { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "tomlkit" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/30/10/abee071c1d52b2bca48be40fe9f64ca878a77e0beef6504597e8c9c1ed84/pylint-3.2.6.tar.gz", hash = "sha256:a5d01678349454806cff6d886fb072294f56a58c4761278c97fb557d708e1eb3", size = 1510167, upload-time = "2024-07-21T19:48:38.032Z" }
@@ -5241,10 +6343,11 @@ version = "8.3.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "exceptiongroup", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "iniconfig" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "packaging" },
     { name = "pluggy" },
+    { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/ae/3c/c9d525a414d506893f0cd8a8d0de7706446213181570cdbd766691164e40/pytest-8.3.5.tar.gz", hash = "sha256:f4efe70cc14e511565ac476b57c279e12a855b11f48f212af1080ef2263d3845", size = 1450891, upload-time = "2025-03-02T12:54:54.503Z" }
 wheels = [
@@ -5256,6 +6359,7 @@ name = "pytest-asyncio"
 version = "1.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
+    { name = "backports-asyncio-runner", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pytest" },
     { name = "typing-extensions", marker = "python_full_version < '3.13' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
@@ -5269,7 +6373,7 @@ name = "pytest-cov"
 version = "7.1.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "coverage" },
+    { name = "coverage", extra = ["toml"] },
     { name = "pluggy" },
     { name = "pytest" },
 ]
@@ -5316,15 +6420,15 @@ wheels = [
 
 [[package]]
 name = "python-discovery"
-version = "1.2.2"
+version = "1.2.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
     { name = "platformdirs" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/de/ef/3bae0e537cfe91e8431efcba4434463d2c5a65f5a89edd47c6cf2f03c55f/python_discovery-1.2.2.tar.gz", hash = "sha256:876e9c57139eb757cb5878cbdd9ae5379e5d96266c99ef731119e04fffe533bb", size = 58872, upload-time = "2026-04-07T17:28:49.249Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b9/88/815e53084c5079a59df912825a279f41dd2e0df82281770eadc732f5352c/python_discovery-1.2.1.tar.gz", hash = "sha256:180c4d114bff1c32462537eac5d6a332b768242b76b69c0259c7d14b1b680c9e", size = 58457, upload-time = "2026-03-26T22:30:44.496Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d8/db/795879cc3ddfe338599bddea6388cc5100b088db0a4caf6e6c1af1c27e04/python_discovery-1.2.2-py3-none-any.whl", hash = "sha256:e1ae95d9af875e78f15e19aed0c6137ab1bb49c200f21f5061786490c9585c7a", size = 31894, upload-time = "2026-04-07T17:28:48.09Z" },
+    { url = "https://files.pythonhosted.org/packages/67/0f/019d3949a40280f6193b62bc010177d4ce702d0fce424322286488569cd3/python_discovery-1.2.1-py3-none-any.whl", hash = "sha256:b6a957b24c1cd79252484d3566d1b49527581d46e789aaf43181005e56201502", size = 31674, upload-time = "2026-03-26T22:30:43.396Z" },
 ]
 
 [[package]]
@@ -5351,11 +6455,11 @@ wheels = [
 
 [[package]]
 name = "python-multipart"
-version = "0.0.26"
+version = "0.0.24"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/88/71/b145a380824a960ebd60e1014256dbb7d2253f2316ff2d73dfd8928ec2c3/python_multipart-0.0.26.tar.gz", hash = "sha256:08fadc45918cd615e26846437f50c5d6d23304da32c341f289a617127b081f17", size = 43501, upload-time = "2026-04-10T14:09:59.473Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/8a/45/e23b5dc14ddb9918ae4a625379506b17b6f8fc56ca1d82db62462f59aea6/python_multipart-0.0.24.tar.gz", hash = "sha256:9574c97e1c026e00bc30340ef7c7d76739512ab4dfd428fec8c330fa6a5cc3c8", size = 37695, upload-time = "2026-04-05T20:49:13.829Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/9a/22/f1925cdda983ab66fc8ec6ec8014b959262747e58bdca26a4e3d1da29d56/python_multipart-0.0.26-py3-none-any.whl", hash = "sha256:c0b169f8c4484c13b0dcf2ef0ec3a4adb255c4b7d18d8e420477d2b1dd03f185", size = 28847, upload-time = "2026-04-10T14:09:58.131Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/73/89930efabd4da63cea44a3f438aeb753d600123570e6d6264e763617a9ce/python_multipart-0.0.24-py3-none-any.whl", hash = "sha256:9b110a98db707df01a53c194f0af075e736a770dc5058089650d70b4a182f950", size = 24420, upload-time = "2026-04-05T20:49:12.555Z" },
 ]
 
 [[package]]
@@ -5372,6 +6476,12 @@ name = "pywin32"
 version = "311"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/7b/40/44efbb0dfbd33aca6a6483191dae0716070ed99e2ecb0c53683f400a0b4f/pywin32-311-cp310-cp310-win32.whl", hash = "sha256:d03ff496d2a0cd4a5893504789d4a15399133fe82517455e78bad62efbb7f0a3", size = 8760432, upload-time = "2025-07-14T20:13:05.9Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/bf/360243b1e953bd254a82f12653974be395ba880e7ec23e3731d9f73921cc/pywin32-311-cp310-cp310-win_amd64.whl", hash = "sha256:797c2772017851984b97180b0bebe4b620bb86328e8a884bb626156295a63b3b", size = 9590103, upload-time = "2025-07-14T20:13:07.698Z" },
+    { url = "https://files.pythonhosted.org/packages/57/38/d290720e6f138086fb3d5ffe0b6caa019a791dd57866940c82e4eeaf2012/pywin32-311-cp310-cp310-win_arm64.whl", hash = "sha256:0502d1facf1fed4839a9a51ccbcc63d952cf318f78ffc00a7e78528ac27d7a2b", size = 8778557, upload-time = "2025-07-14T20:13:11.11Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/af/449a6a91e5d6db51420875c54f6aff7c97a86a3b13a0b4f1a5c13b988de3/pywin32-311-cp311-cp311-win32.whl", hash = "sha256:184eb5e436dea364dcd3d2316d577d625c0351bf237c4e9a5fabbcfa5a58b151", size = 8697031, upload-time = "2025-07-14T20:13:13.266Z" },
+    { url = "https://files.pythonhosted.org/packages/51/8f/9bb81dd5bb77d22243d33c8397f09377056d5c687aa6d4042bea7fbf8364/pywin32-311-cp311-cp311-win_amd64.whl", hash = "sha256:3ce80b34b22b17ccbd937a6e78e7225d80c52f5ab9940fe0506a1a16f3dab503", size = 9508308, upload-time = "2025-07-14T20:13:15.147Z" },
+    { url = "https://files.pythonhosted.org/packages/44/7b/9c2ab54f74a138c491aba1b1cd0795ba61f144c711daea84a88b63dc0f6c/pywin32-311-cp311-cp311-win_arm64.whl", hash = "sha256:a733f1388e1a842abb67ffa8e7aad0e70ac519e09b0f6a784e65a136ec7cefd2", size = 8703930, upload-time = "2025-07-14T20:13:16.945Z" },
     { url = "https://files.pythonhosted.org/packages/e7/ab/01ea1943d4eba0f850c3c61e78e8dd59757ff815ff3ccd0a84de5f541f42/pywin32-311-cp312-cp312-win32.whl", hash = "sha256:750ec6e621af2b948540032557b10a2d43b0cee2ae9758c54154d711cc852d31", size = 8706543, upload-time = "2025-07-14T20:13:20.765Z" },
     { url = "https://files.pythonhosted.org/packages/d1/a8/a0e8d07d4d051ec7502cd58b291ec98dcc0c3fff027caad0470b72cfcc2f/pywin32-311-cp312-cp312-win_amd64.whl", hash = "sha256:b8c095edad5c211ff31c05223658e71bf7116daa0ecf3ad85f3201ea3190d067", size = 9495040, upload-time = "2025-07-14T20:13:22.543Z" },
     { url = "https://files.pythonhosted.org/packages/ba/3a/2ae996277b4b50f17d61f0603efd8253cb2d79cc7ae159468007b586396d/pywin32-311-cp312-cp312-win_arm64.whl", hash = "sha256:e286f46a9a39c4a18b319c28f59b61de793654af2f395c102b4f819e584b5852", size = 8710102, upload-time = "2025-07-14T20:13:24.682Z" },
@@ -5389,6 +6499,24 @@ version = "6.0.3"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960, upload-time = "2025-09-25T21:33:16.546Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/a0/39350dd17dd6d6c6507025c0e53aef67a9293a6d37d3511f23ea510d5800/pyyaml-6.0.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:214ed4befebe12df36bcc8bc2b64b396ca31be9304b8f59e25c11cf94a4c033b", size = 184227, upload-time = "2025-09-25T21:31:46.04Z" },
+    { url = "https://files.pythonhosted.org/packages/05/14/52d505b5c59ce73244f59c7a50ecf47093ce4765f116cdb98286a71eeca2/pyyaml-6.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:02ea2dfa234451bbb8772601d7b8e426c2bfa197136796224e50e35a78777956", size = 174019, upload-time = "2025-09-25T21:31:47.706Z" },
+    { url = "https://files.pythonhosted.org/packages/43/f7/0e6a5ae5599c838c696adb4e6330a59f463265bfa1e116cfd1fbb0abaaae/pyyaml-6.0.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b30236e45cf30d2b8e7b3e85881719e98507abed1011bf463a8fa23e9c3e98a8", size = 740646, upload-time = "2025-09-25T21:31:49.21Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/3a/61b9db1d28f00f8fd0ae760459a5c4bf1b941baf714e207b6eb0657d2578/pyyaml-6.0.3-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66291b10affd76d76f54fad28e22e51719ef9ba22b29e1d7d03d6777a9174198", size = 840793, upload-time = "2025-09-25T21:31:50.735Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/1e/7acc4f0e74c4b3d9531e24739e0ab832a5edf40e64fbae1a9c01941cabd7/pyyaml-6.0.3-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9c7708761fccb9397fe64bbc0395abcae8c4bf7b0eac081e12b809bf47700d0b", size = 770293, upload-time = "2025-09-25T21:31:51.828Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/ef/abd085f06853af0cd59fa5f913d61a8eab65d7639ff2a658d18a25d6a89d/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:418cf3f2111bc80e0933b2cd8cd04f286338bb88bdc7bc8e6dd775ebde60b5e0", size = 732872, upload-time = "2025-09-25T21:31:53.282Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/15/2bc9c8faf6450a8b3c9fc5448ed869c599c0a74ba2669772b1f3a0040180/pyyaml-6.0.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5e0b74767e5f8c593e8c9b5912019159ed0533c70051e9cce3e8b6aa699fcd69", size = 758828, upload-time = "2025-09-25T21:31:54.807Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/00/531e92e88c00f4333ce359e50c19b8d1de9fe8d581b1534e35ccfbc5f393/pyyaml-6.0.3-cp310-cp310-win32.whl", hash = "sha256:28c8d926f98f432f88adc23edf2e6d4921ac26fb084b028c733d01868d19007e", size = 142415, upload-time = "2025-09-25T21:31:55.885Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/fa/926c003379b19fca39dd4634818b00dec6c62d87faf628d1394e137354d4/pyyaml-6.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:bdb2c67c6c1390b63c6ff89f210c8fd09d9a1217a465701eac7316313c915e4c", size = 158561, upload-time = "2025-09-25T21:31:57.406Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/16/a95b6757765b7b031c9374925bb718d55e0a9ba8a1b6a12d25962ea44347/pyyaml-6.0.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:44edc647873928551a01e7a563d7452ccdebee747728c1080d881d68af7b997e", size = 185826, upload-time = "2025-09-25T21:31:58.655Z" },
+    { url = "https://files.pythonhosted.org/packages/16/19/13de8e4377ed53079ee996e1ab0a9c33ec2faf808a4647b7b4c0d46dd239/pyyaml-6.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:652cb6edd41e718550aad172851962662ff2681490a8a711af6a4d288dd96824", size = 175577, upload-time = "2025-09-25T21:32:00.088Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/62/d2eb46264d4b157dae1275b573017abec435397aa59cbcdab6fc978a8af4/pyyaml-6.0.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:10892704fc220243f5305762e276552a0395f7beb4dbf9b14ec8fd43b57f126c", size = 775556, upload-time = "2025-09-25T21:32:01.31Z" },
+    { url = "https://files.pythonhosted.org/packages/10/cb/16c3f2cf3266edd25aaa00d6c4350381c8b012ed6f5276675b9eba8d9ff4/pyyaml-6.0.3-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:850774a7879607d3a6f50d36d04f00ee69e7fc816450e5f7e58d7f17f1ae5c00", size = 882114, upload-time = "2025-09-25T21:32:03.376Z" },
+    { url = "https://files.pythonhosted.org/packages/71/60/917329f640924b18ff085ab889a11c763e0b573da888e8404ff486657602/pyyaml-6.0.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8bb0864c5a28024fac8a632c443c87c5aa6f215c0b126c449ae1a150412f31d", size = 806638, upload-time = "2025-09-25T21:32:04.553Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/6f/529b0f316a9fd167281a6c3826b5583e6192dba792dd55e3203d3f8e655a/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1d37d57ad971609cf3c53ba6a7e365e40660e3be0e5175fa9f2365a379d6095a", size = 767463, upload-time = "2025-09-25T21:32:06.152Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/6a/b627b4e0c1dd03718543519ffb2f1deea4a1e6d42fbab8021936a4d22589/pyyaml-6.0.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:37503bfbfc9d2c40b344d06b2199cf0e96e97957ab1c1b546fd4f87e53e5d3e4", size = 794986, upload-time = "2025-09-25T21:32:07.367Z" },
+    { url = "https://files.pythonhosted.org/packages/45/91/47a6e1c42d9ee337c4839208f30d9f09caa9f720ec7582917b264defc875/pyyaml-6.0.3-cp311-cp311-win32.whl", hash = "sha256:8098f252adfa6c80ab48096053f512f2321f0b998f98150cea9bd23d83e1467b", size = 142543, upload-time = "2025-09-25T21:32:08.95Z" },
+    { url = "https://files.pythonhosted.org/packages/da/e3/ea007450a105ae919a72393cb06f122f288ef60bba2dc64b26e2646fa315/pyyaml-6.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:9f3bfb4965eb874431221a3ff3fdcddc7e74e3b07799e0e84ca4a0f867d449bf", size = 158763, upload-time = "2025-09-25T21:32:09.96Z" },
     { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063, upload-time = "2025-09-25T21:32:11.445Z" },
     { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973, upload-time = "2025-09-25T21:32:12.492Z" },
     { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116, upload-time = "2025-09-25T21:32:13.652Z" },
@@ -5475,30 +6603,34 @@ wheels = [
 
 [[package]]
 name = "ray"
-version = "2.55.0"
+version = "2.54.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
     { name = "filelock" },
     { name = "jsonschema" },
     { name = "msgpack" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "packaging" },
     { name = "protobuf" },
     { name = "pyyaml" },
     { name = "requests" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f9/d4/586e6a696004262b74a7a53b3860c3178b9940ed3c5ee5d909cb7f8f3149/ray-2.55.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6f0b8dfa3716cc9be5fce3b53e9bfdb73cea36025bfe6f1d27928d0f84cfd695", size = 65822329, upload-time = "2026-04-15T04:32:14.218Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/6a/0c1a1179832b9dd93c615289ab92eefd5d844f6e6cea313db09bd55b84da/ray-2.55.0-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:b77f406072ac0ce90431ac436828f364c183ab57ba15c3a0e688a74ae3c2d3f3", size = 72910696, upload-time = "2026-04-15T04:32:19.915Z" },
-    { url = "https://files.pythonhosted.org/packages/7e/a5/2a17fd0aed4f053462eeabfeb6b3ff1b46a85ac9a7da1ebf99d60683f3c2/ray-2.55.0-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:cd460bdbf8a8a4bb768a20c38b1c534d84fe63bc0e5f3580c5c0ef7302b986b3", size = 73765215, upload-time = "2026-04-15T04:32:25.486Z" },
-    { url = "https://files.pythonhosted.org/packages/ea/ac/40ee9b4a514366a1dc8ed50a16a4fc095192c89f2eccb30b15ab710addf3/ray-2.55.0-cp312-cp312-win_amd64.whl", hash = "sha256:5da06d27358d38c30a723a617bf9b7df138f4d90e8046f1fa51d9b8c7473b64a", size = 27865785, upload-time = "2026-04-15T04:32:31.587Z" },
-    { url = "https://files.pythonhosted.org/packages/a5/47/c57556d981a7ed0fe1438f9ff1ecd601bf7d5c704e7698bf181536470acc/ray-2.55.0-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:1da8b8755b6e4fde03db78b6ce2bbcecfcfbd20d39b93833d246c515daeedf3c", size = 65766784, upload-time = "2026-04-15T04:32:36.85Z" },
-    { url = "https://files.pythonhosted.org/packages/00/e8/09ebc53f76800130da0a38bb28e14924ec28daa0a9f41b75146056b52f7c/ray-2.55.0-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:ceee87a884602aab34db109415e6839a6e9169f4750cab727b7ea1610df5b91f", size = 72818556, upload-time = "2026-04-15T04:32:42.275Z" },
-    { url = "https://files.pythonhosted.org/packages/72/6c/97bd20bc62e5dc1b40784261e38f5fce9aee9765c51332a41e95083507a1/ray-2.55.0-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:f16dea32e5cc58ed406c0ef0dd4be69d60ce77a075edb5f0380356a48bf85ab3", size = 73678945, upload-time = "2026-04-15T04:32:49.379Z" },
-    { url = "https://files.pythonhosted.org/packages/9b/90/2b71910c00372634c73a15d5252a09999b1a806301e9759c90dc71479247/ray-2.55.0-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:d48bc4533b3b76d59ed3f9eab1e6b7322a53a7cdefb8f657d9b46eebad56dbee", size = 65774400, upload-time = "2026-04-15T04:32:55.279Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/72/0283b4d2289567511918de44a890d3dc3f5da529112481b75402bb67550d/ray-2.55.0-cp314-cp314-manylinux2014_aarch64.whl", hash = "sha256:b74390f201f28f05c8f250069dfed54d6d6a0109ffe482425d76c11be820e309", size = 72813920, upload-time = "2026-04-15T04:33:00.899Z" },
-    { url = "https://files.pythonhosted.org/packages/a8/da/f701527fe5e3b84afbdffe206b39d2929013301ef15660b99794debdf2d3/ray-2.55.0-cp314-cp314-manylinux2014_x86_64.whl", hash = "sha256:eb0a6179641bc420a66ee85cc9b382e58f22effbd36297e3683a793e5cdc0898", size = 73644251, upload-time = "2026-04-15T04:33:07.754Z" },
+    { url = "https://files.pythonhosted.org/packages/af/cf/9a6e33b59e1a12428b4fbd6cc38f7e32d116ccde4c72e15c3f76a22bf36d/ray-2.54.1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:2ea650e648acc6e76edd98c694657fd1fcb1cd97700d944a7d20da90269e9810", size = 70088753, upload-time = "2026-03-25T22:40:08.213Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/64/fd46863a479ca62c0110f8f56db71edb871ddba137d4701efba0c5951600/ray-2.54.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:6425f15cfe6a298366b53c8658350f94ced2c548802ca3b69f94b87db16e97c5", size = 71702573, upload-time = "2026-03-25T22:40:15.403Z" },
+    { url = "https://files.pythonhosted.org/packages/55/96/7911234a14b891320e652b5ae258050f98584f22a8e33afba9ad43ab27c9/ray-2.54.1-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:054985194bd32f4464c93f9318d247fac61e1f32ac221565ecfdc81ab8c75d0b", size = 72537837, upload-time = "2026-03-25T22:40:21.418Z" },
+    { url = "https://files.pythonhosted.org/packages/41/2a/5cac846ed9b247bc47c3b2618b8b550c52e56fb82be923cae37d9c1161aa/ray-2.54.1-cp310-cp310-win_amd64.whl", hash = "sha256:512587412e2f5e1753adabfdfa4dd9cff1dc509601e36fd5fab671e448ae4dac", size = 27452712, upload-time = "2026-03-25T22:40:26.351Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/1b/f08534e625011fe07017b788ade9bde7570e2e5e0687984d51ced2935c69/ray-2.54.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:c0240496af274af7cd3b1b1d015f23b88e5fdafe59bfdc040e5f229e0aff5dff", size = 70090027, upload-time = "2026-03-25T22:40:32.816Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/90/3455fce4485140aed0f00433fd55294365f1b707dfd547cad6427212bca2/ray-2.54.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:86c51eafd3e84dad59c1ef4cf97b3ac8c088af0705782ee915e31bca5880597a", size = 71798478, upload-time = "2026-03-25T22:40:39.058Z" },
+    { url = "https://files.pythonhosted.org/packages/34/61/04bb126d798962970cca5c88394edee862e91bf97b5e6abbee1478e0f9fc/ray-2.54.1-cp311-cp311-manylinux2014_x86_64.whl", hash = "sha256:e095dfe9c521a04e5930520b4a82ea82d61903d4cd2f3270fbc5dfbdb41b9c72", size = 72631241, upload-time = "2026-03-25T22:40:44.981Z" },
+    { url = "https://files.pythonhosted.org/packages/82/eb/d5a27dc5f07d9f1e50a3b573305ae6272eb5a43c8323994d6168bffa443e/ray-2.54.1-cp311-cp311-win_amd64.whl", hash = "sha256:ea90bed0110e0ce3ff6571e7a0c800920a3c6d299d29b8eac020dac362667169", size = 27449001, upload-time = "2026-03-25T22:40:49.852Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/ca/f3274e0d513c44949ea9167c12c07f9971e5f25ef22b698448a6ca831434/ray-2.54.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:645ebfb73cfd32bd510a05ed9f2738a18d6db69929cae9701d749f2740dbfd9a", size = 70076126, upload-time = "2026-03-25T22:40:55.188Z" },
+    { url = "https://files.pythonhosted.org/packages/51/6f/bf1b7a6d4424c19add99eb17398c7522473502193540b679f8b94fbf2d72/ray-2.54.1-cp312-cp312-manylinux2014_aarch64.whl", hash = "sha256:cd452b61ae2e0daf9271f5a554614397429cc2731681bae10fe72316dadc2749", size = 71831684, upload-time = "2026-03-25T22:41:01.356Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/1f/b33d5006823f8c1c8760887cf1190194f4b06de858b3d17e37bd930a6a62/ray-2.54.1-cp312-cp312-manylinux2014_x86_64.whl", hash = "sha256:4c6f7e23dda62a32f94083141c3f97e9c4246e3ae4ae2bc488bcd8fd0311f54a", size = 72688748, upload-time = "2026-03-25T22:41:07.43Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/7d/02b46d3fe644e1feef62b9e4ebf8cbfc17c6b2d283763208abc52c3dc85e/ray-2.54.1-cp312-cp312-win_amd64.whl", hash = "sha256:673a895c0c4a716ed772552baa3f5b8d7d1f7a4b34e04787fdfe6fe3049ed0d8", size = 27427871, upload-time = "2026-03-25T22:41:12.485Z" },
+    { url = "https://files.pythonhosted.org/packages/80/30/90f9f8f0fcba72b898c40854e020c9d5330f33b4ccd711747cc07e061416/ray-2.54.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:d05f477d1518a00fd5880644e889a7a3eaf64ae5d1f8f239a682d052ad2a383d", size = 70023037, upload-time = "2026-03-25T22:41:17.895Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/5d/fe0e8ac47f6b362c81f391d7f8d2a6858d0bafcc2c37631dc5cc04a16545/ray-2.54.1-cp313-cp313-manylinux2014_aarch64.whl", hash = "sha256:2766f0230806480c38a9a94502087f1d4aea919f38521a28781690613b0290a4", size = 71738623, upload-time = "2026-03-25T22:41:23.898Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/22/48008a626e719baee2012080b960687cc6417b572b363c1c29fe23d119c3/ray-2.54.1-cp313-cp313-manylinux2014_x86_64.whl", hash = "sha256:0c3ae2943176e7b239c78b825a5b2bf4135d90280083a0e19c0a75a5db4d836f", size = 72603355, upload-time = "2026-03-25T22:41:29.802Z" },
 ]
 
 [package.optional-dependencies]
@@ -5541,6 +6673,39 @@ version = "2026.4.4"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/cb/0e/3a246dbf05666918bd3664d9d787f84a9108f6f43cc953a077e4a7dfdb7e/regex-2026.4.4.tar.gz", hash = "sha256:e08270659717f6973523ce3afbafa53515c4dc5dcad637dc215b6fd50f689423", size = 416000, upload-time = "2026-04-03T20:56:28.155Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/12/59/fd98f8fd54b3feaa76a855324c676c17668c5a1121ec91b7ec96b01bf865/regex-2026.4.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:74fa82dcc8143386c7c0392e18032009d1db715c25f4ba22d23dc2e04d02a20f", size = 489403, upload-time = "2026-04-03T20:52:39.742Z" },
+    { url = "https://files.pythonhosted.org/packages/6c/64/d0f222f68e3579d50babf0e4fcc9c9639ef0587fecc00b15e1e46bfc32fa/regex-2026.4.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a85b620a388d6c9caa12189233109e236b3da3deffe4ff11b84ae84e218a274f", size = 291208, upload-time = "2026-04-03T20:52:42.943Z" },
+    { url = "https://files.pythonhosted.org/packages/16/7f/3fab9709b0b0060ba81a04b8a107b34147cd14b9c5551b772154d6505504/regex-2026.4.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2895506ebe32cc63eeed8f80e6eae453171cfccccab35b70dc3129abec35a5b8", size = 289214, upload-time = "2026-04-03T20:52:44.648Z" },
+    { url = "https://files.pythonhosted.org/packages/14/bc/f5dcf04fd462139dcd75495c02eee22032ef741cfa151386a39c3f5fc9b5/regex-2026.4.4-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6780f008ee81381c737634e75c24e5a6569cc883c4f8e37a37917ee79efcafd9", size = 785505, upload-time = "2026-04-03T20:52:46.35Z" },
+    { url = "https://files.pythonhosted.org/packages/37/36/8a906e216d5b4de7ec3788c1d589b45db40c1c9580cd7b326835cfc976d4/regex-2026.4.4-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:88e9b048345c613f253bea4645b2fe7e579782b82cac99b1daad81e29cc2ed8e", size = 852129, upload-time = "2026-04-03T20:52:48.661Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/bb/bad2d79be0917a6ef31f5e0f161d9265cb56fd90a3ae1d2e8d991882a48b/regex-2026.4.4-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:be061028481186ba62a0f4c5f1cc1e3d5ab8bce70c89236ebe01023883bc903b", size = 899578, upload-time = "2026-04-03T20:52:50.61Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/b9/7cd0ceb58cd99c70806241636640ae15b4a3fe62e22e9b99afa67a0d7965/regex-2026.4.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d2228c02b368d69b724c36e96d3d1da721561fb9cc7faa373d7bf65e07d75cb5", size = 793634, upload-time = "2026-04-03T20:52:53Z" },
+    { url = "https://files.pythonhosted.org/packages/2c/fb/c58e3ea40ed183806ccbac05c29a3e8c2f88c1d3a66ed27860d5cad7c62d/regex-2026.4.4-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0540e5b733618a2f84e9cb3e812c8afa82e151ca8e19cf6c4e95c5a65198236f", size = 786210, upload-time = "2026-04-03T20:52:54.713Z" },
+    { url = "https://files.pythonhosted.org/packages/54/a9/53790fc7a6c948a7be2bc7214fd9cabdd0d1ba561b0f401c91f4ff0357f0/regex-2026.4.4-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cf9b1b2e692d4877880388934ac746c99552ce6bf40792a767fd42c8c99f136d", size = 769930, upload-time = "2026-04-03T20:52:56.825Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/3c/29ca44729191c79f5476538cd0fa04fa2553b3c45508519ecea4c7afa8f6/regex-2026.4.4-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:011bb48bffc1b46553ac704c975b3348717f4e4aa7a67522b51906f99da1820c", size = 774892, upload-time = "2026-04-03T20:52:58.934Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/db/6ae74ef8a4cfead341c367e4eed45f71fb1aaba35827a775eed4f1ba4f74/regex-2026.4.4-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8512fcdb43f1bf18582698a478b5ab73f9c1667a5b7548761329ef410cd0a760", size = 848816, upload-time = "2026-04-03T20:53:00.684Z" },
+    { url = "https://files.pythonhosted.org/packages/53/9a/f7f2c1c6b610d7c6de1c3dc5951effd92c324b1fde761af2044b4721020f/regex-2026.4.4-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:867bddc63109a0276f5a31999e4c8e0eb7bbbad7d6166e28d969a2c1afeb97f9", size = 758363, upload-time = "2026-04-03T20:53:02.155Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/55/e5386d393bbf8b43c8b084703a46d635e7b2bdc6e0f5909a2619ea1125f1/regex-2026.4.4-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:1b9a00b83f3a40e09859c78920571dcb83293c8004079653dd22ec14bbfa98c7", size = 837122, upload-time = "2026-04-03T20:53:03.727Z" },
+    { url = "https://files.pythonhosted.org/packages/01/da/cc78710ea2e60b10bacfcc9beb18c67514200ab03597b3b2b319995785c2/regex-2026.4.4-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e355be718caf838aa089870259cf1776dc2a4aa980514af9d02c59544d9a8b22", size = 782140, upload-time = "2026-04-03T20:53:05.608Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/5f/c7bcba41529105d6c2ca7080ecab7184cd00bee2e1ad1fdea80e618704ea/regex-2026.4.4-cp310-cp310-win32.whl", hash = "sha256:33bfda9684646d323414df7abe5692c61d297dbb0530b28ec66442e768813c59", size = 266225, upload-time = "2026-04-03T20:53:07.342Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/26/a745729c2c49354ec4f4bce168f29da932ca01b4758227686cc16c7dde1b/regex-2026.4.4-cp310-cp310-win_amd64.whl", hash = "sha256:0709f22a56798457ae317bcce42aacee33c680068a8f14097430d9f9ba364bee", size = 278393, upload-time = "2026-04-03T20:53:08.65Z" },
+    { url = "https://files.pythonhosted.org/packages/87/8b/4327eeb9dbb4b098ebecaf02e9f82b79b6077beeb54c43d9a0660cf7c44c/regex-2026.4.4-cp310-cp310-win_arm64.whl", hash = "sha256:ee9627de8587c1a22201cb16d0296ab92b4df5cdcb5349f4e9744d61db7c7c98", size = 270470, upload-time = "2026-04-03T20:53:10.018Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/7a/617356cbecdb452812a5d42f720d6d5096b360d4a4c1073af700ea140ad2/regex-2026.4.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b4c36a85b00fadb85db9d9e90144af0a980e1a3d2ef9cd0f8a5bef88054657c6", size = 489415, upload-time = "2026-04-03T20:53:11.645Z" },
+    { url = "https://files.pythonhosted.org/packages/20/e6/bf057227144d02e3ba758b66649e87531d744dda5f3254f48660f18ae9d8/regex-2026.4.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:dcb5453ecf9cd58b562967badd1edbf092b0588a3af9e32ee3d05c985077ce87", size = 291205, upload-time = "2026-04-03T20:53:13.289Z" },
+    { url = "https://files.pythonhosted.org/packages/eb/3b/637181b787dd1a820ba1c712cee2b4144cd84a32dc776ca067b12b2d70c8/regex-2026.4.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:6aa809ed4dc3706cc38594d67e641601bd2f36d5555b2780ff074edfcb136cf8", size = 289225, upload-time = "2026-04-03T20:53:16.002Z" },
+    { url = "https://files.pythonhosted.org/packages/05/21/bac05d806ed02cd4b39d9c8e5b5f9a2998c94c3a351b7792e80671fa5315/regex-2026.4.4-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:33424f5188a7db12958246a54f59a435b6cb62c5cf9c8d71f7cc49475a5fdada", size = 792434, upload-time = "2026-04-03T20:53:17.414Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/17/c65d1d8ae90b772d5758eb4014e1e011bb2db353fc4455432e6cc9100df7/regex-2026.4.4-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7d346fccdde28abba117cc9edc696b9518c3307fbfcb689e549d9b5979018c6d", size = 861730, upload-time = "2026-04-03T20:53:18.903Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/64/933321aa082a2c6ee2785f22776143ba89840189c20d3b6b1d12b6aae16b/regex-2026.4.4-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:415a994b536440f5011aa77e50a4274d15da3245e876e5c7f19da349caaedd87", size = 906495, upload-time = "2026-04-03T20:53:20.561Z" },
+    { url = "https://files.pythonhosted.org/packages/01/ea/4c8d306e9c36ac22417336b1e02e7b358152c34dc379673f2d331143725f/regex-2026.4.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21e5eb86179b4c67b5759d452ea7c48eb135cd93308e7a260aa489ed2eb423a4", size = 799810, upload-time = "2026-04-03T20:53:22.961Z" },
+    { url = "https://files.pythonhosted.org/packages/29/ce/7605048f00e1379eba89d610c7d644d8f695dc9b26d3b6ecfa3132b872ff/regex-2026.4.4-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:312ec9dd1ae7d96abd8c5a36a552b2139931914407d26fba723f9e53c8186f86", size = 774242, upload-time = "2026-04-03T20:53:25.015Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/77/283e0d5023fde22cd9e86190d6d9beb21590a452b195ffe00274de470691/regex-2026.4.4-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a0d2b28aa1354c7cd7f71b7658c4326f7facac106edd7f40eda984424229fd59", size = 781257, upload-time = "2026-04-03T20:53:26.918Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/fb/7f3b772be101373c8626ed34c5d727dcbb8abd42a7b1219bc25fd9a3cc04/regex-2026.4.4-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:349d7310eddff40429a099c08d995c6d4a4bfaf3ff40bd3b5e5cb5a5a3c7d453", size = 854490, upload-time = "2026-04-03T20:53:29.065Z" },
+    { url = "https://files.pythonhosted.org/packages/85/30/56547b80f34f4dd2986e1cdd63b1712932f63b6c4ce2f79c50a6cd79d1c2/regex-2026.4.4-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:e7ab63e9fe45a9ec3417509e18116b367e89c9ceb6219222a3396fa30b147f80", size = 763544, upload-time = "2026-04-03T20:53:30.917Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/2f/ce060fdfea8eff34a8997603532e44cdb7d1f35e3bc253612a8707a90538/regex-2026.4.4-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:fe896e07a5a2462308297e515c0054e9ec2dd18dfdc9427b19900b37dfe6f40b", size = 844442, upload-time = "2026-04-03T20:53:32.463Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/44/810cb113096a1dacbe82789fbfab2823f79d19b7f1271acecb7009ba9b88/regex-2026.4.4-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:eb59c65069498dbae3c0ef07bbe224e1eaa079825a437fb47a479f0af11f774f", size = 789162, upload-time = "2026-04-03T20:53:34.039Z" },
+    { url = "https://files.pythonhosted.org/packages/20/96/9647dd7f2ecf6d9ce1fb04dfdb66910d094e10d8fe53e9c15096d8aa0bd2/regex-2026.4.4-cp311-cp311-win32.whl", hash = "sha256:2a5d273181b560ef8397c8825f2b9d57013de744da9e8257b8467e5da8599351", size = 266227, upload-time = "2026-04-03T20:53:35.601Z" },
+    { url = "https://files.pythonhosted.org/packages/33/80/74e13262460530c3097ff343a17de9a34d040a5dc4de9cf3a8241faab51c/regex-2026.4.4-cp311-cp311-win_amd64.whl", hash = "sha256:9542ccc1e689e752594309444081582f7be2fdb2df75acafea8a075108566735", size = 278399, upload-time = "2026-04-03T20:53:37.021Z" },
+    { url = "https://files.pythonhosted.org/packages/1c/3c/39f19f47f19dcefa3403f09d13562ca1c0fd07ab54db2bc03148f3f6b46a/regex-2026.4.4-cp311-cp311-win_arm64.whl", hash = "sha256:b5f9fb784824a042be3455b53d0b112655686fdb7a91f88f095f3fee1e2a2a54", size = 270473, upload-time = "2026-04-03T20:53:38.633Z" },
     { url = "https://files.pythonhosted.org/packages/e5/28/b972a4d3df61e1d7bcf1b59fdb3cddef22f88b6be43f161bb41ebc0e4081/regex-2026.4.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:c07ab8794fa929e58d97a0e1796b8b76f70943fa39df225ac9964615cf1f9d52", size = 490434, upload-time = "2026-04-03T20:53:40.219Z" },
     { url = "https://files.pythonhosted.org/packages/84/20/30041446cf6dc3e0eab344fc62770e84c23b6b68a3b657821f9f80cb69b4/regex-2026.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2c785939dc023a1ce4ec09599c032cc9933d258a998d16ca6f2b596c010940eb", size = 292061, upload-time = "2026-04-03T20:53:41.862Z" },
     { url = "https://files.pythonhosted.org/packages/62/c8/3baa06d75c98c46d4cc4262b71fd2edb9062b5665e868bca57859dadf93a/regex-2026.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1b1ce5c81c9114f1ce2f9288a51a8fd3aeea33a0cc440c415bf02da323aa0a76", size = 289628, upload-time = "2026-04-03T20:53:43.701Z" },
@@ -5650,30 +6815,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" },
 ]
 
-[[package]]
-name = "responses"
-version = "0.18.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "requests", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "urllib3", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/03/a5/186653e51cb20fe3ac793403334d4d077fbb7bb18a9c5c2fce8304d5a2e2/responses-0.18.0.tar.gz", hash = "sha256:380cad4c1c1dc942e5e8a8eaae0b4d4edf708f4f010db8b7bcfafad1fcd254ff", size = 45885, upload-time = "2022-02-02T19:59:52.834Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/79/f3/2b3a6dc5986303b3dd1bbbcf482022acb2583c428cd23f0b6d37b1a1a519/responses-0.18.0-py3-none-any.whl", hash = "sha256:15c63ad16de13ee8e7182d99c9334f64fd81f1ee79f90748d527c28f7ca9dd51", size = 38735, upload-time = "2022-02-02T19:59:52.833Z" },
-]
-
 [[package]]
 name = "rich"
-version = "15.0.0"
+version = "14.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markdown-it-py" },
+    { name = "markdown-it-py", version = "3.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "markdown-it-py", version = "4.0.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pygments" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680, upload-time = "2026-04-12T08:24:00.75Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/b3/c6/f3b320c27991c46f43ee9d856302c70dc2d0fb2dba4842ff739d5f46b393/rich-14.3.3.tar.gz", hash = "sha256:b8daa0b9e4eef54dd8cf7c86c03713f53241884e814f4e2f5fb342fe520f639b", size = 230582, upload-time = "2026-02-19T17:23:12.474Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654, upload-time = "2026-04-12T08:24:02.83Z" },
+    { url = "https://files.pythonhosted.org/packages/14/25/b208c5683343959b670dc001595f2f3737e051da617f66c31f7c4fa93abc/rich-14.3.3-py3-none-any.whl", hash = "sha256:793431c1f8619afa7d3b52b2cdec859562b950ea0d4b6b505397612db8d5362d", size = 310458, upload-time = "2026-02-19T17:23:13.732Z" },
 ]
 
 [[package]]
@@ -5691,6 +6844,35 @@ version = "0.30.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/20/af/3f2f423103f1113b36230496629986e0ef7e199d2aa8392452b484b38ced/rpds_py-0.30.0.tar.gz", hash = "sha256:dd8ff7cf90014af0c0f787eea34794ebf6415242ee1d6fa91eaba725cc441e84", size = 69469, upload-time = "2025-11-30T20:24:38.837Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/06/0c/0c411a0ec64ccb6d104dcabe0e713e05e153a9a2c3c2bd2b32ce412166fe/rpds_py-0.30.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:679ae98e00c0e8d68a7fda324e16b90fd5260945b45d3b824c892cec9eea3288", size = 370490, upload-time = "2025-11-30T20:21:33.256Z" },
+    { url = "https://files.pythonhosted.org/packages/19/6a/4ba3d0fb7297ebae71171822554abe48d7cab29c28b8f9f2c04b79988c05/rpds_py-0.30.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4cc2206b76b4f576934f0ed374b10d7ca5f457858b157ca52064bdfc26b9fc00", size = 359751, upload-time = "2025-11-30T20:21:34.591Z" },
+    { url = "https://files.pythonhosted.org/packages/cd/7c/e4933565ef7f7a0818985d87c15d9d273f1a649afa6a52ea35ad011195ea/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:389a2d49eded1896c3d48b0136ead37c48e221b391c052fba3f4055c367f60a6", size = 389696, upload-time = "2025-11-30T20:21:36.122Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/01/6271a2511ad0815f00f7ed4390cf2567bec1d4b1da39e2c27a41e6e3b4de/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:32c8528634e1bf7121f3de08fa85b138f4e0dc47657866630611b03967f041d7", size = 403136, upload-time = "2025-11-30T20:21:37.728Z" },
+    { url = "https://files.pythonhosted.org/packages/55/64/c857eb7cd7541e9b4eee9d49c196e833128a55b89a9850a9c9ac33ccf897/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f207f69853edd6f6700b86efb84999651baf3789e78a466431df1331608e5324", size = 524699, upload-time = "2025-11-30T20:21:38.92Z" },
+    { url = "https://files.pythonhosted.org/packages/9c/ed/94816543404078af9ab26159c44f9e98e20fe47e2126d5d32c9d9948d10a/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:67b02ec25ba7a9e8fa74c63b6ca44cf5707f2fbfadae3ee8e7494297d56aa9df", size = 412022, upload-time = "2025-11-30T20:21:40.407Z" },
+    { url = "https://files.pythonhosted.org/packages/61/b5/707f6cf0066a6412aacc11d17920ea2e19e5b2f04081c64526eb35b5c6e7/rpds_py-0.30.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0c0e95f6819a19965ff420f65578bacb0b00f251fefe2c8b23347c37174271f3", size = 390522, upload-time = "2025-11-30T20:21:42.17Z" },
+    { url = "https://files.pythonhosted.org/packages/13/4e/57a85fda37a229ff4226f8cbcf09f2a455d1ed20e802ce5b2b4a7f5ed053/rpds_py-0.30.0-cp310-cp310-manylinux_2_31_riscv64.whl", hash = "sha256:a452763cc5198f2f98898eb98f7569649fe5da666c2dc6b5ddb10fde5a574221", size = 404579, upload-time = "2025-11-30T20:21:43.769Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/da/c9339293513ec680a721e0e16bf2bac3db6e5d7e922488de471308349bba/rpds_py-0.30.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e0b65193a413ccc930671c55153a03ee57cecb49e6227204b04fae512eb657a7", size = 421305, upload-time = "2025-11-30T20:21:44.994Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/be/522cb84751114f4ad9d822ff5a1aa3c98006341895d5f084779b99596e5c/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:858738e9c32147f78b3ac24dc0edb6610000e56dc0f700fd5f651d0a0f0eb9ff", size = 572503, upload-time = "2025-11-30T20:21:46.91Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/9b/de879f7e7ceddc973ea6e4629e9b380213a6938a249e94b0cdbcc325bb66/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:da279aa314f00acbb803da1e76fa18666778e8a8f83484fba94526da5de2cba7", size = 598322, upload-time = "2025-11-30T20:21:48.709Z" },
+    { url = "https://files.pythonhosted.org/packages/48/ac/f01fc22efec3f37d8a914fc1b2fb9bcafd56a299edbe96406f3053edea5a/rpds_py-0.30.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7c64d38fb49b6cdeda16ab49e35fe0da2e1e9b34bc38bd78386530f218b37139", size = 560792, upload-time = "2025-11-30T20:21:50.024Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/da/4e2b19d0f131f35b6146425f846563d0ce036763e38913d917187307a671/rpds_py-0.30.0-cp310-cp310-win32.whl", hash = "sha256:6de2a32a1665b93233cde140ff8b3467bdb9e2af2b91079f0333a0974d12d464", size = 221901, upload-time = "2025-11-30T20:21:51.32Z" },
+    { url = "https://files.pythonhosted.org/packages/96/cb/156d7a5cf4f78a7cc571465d8aec7a3c447c94f6749c5123f08438bcf7bc/rpds_py-0.30.0-cp310-cp310-win_amd64.whl", hash = "sha256:1726859cd0de969f88dc8673bdd954185b9104e05806be64bcd87badbe313169", size = 235823, upload-time = "2025-11-30T20:21:52.505Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/6e/f964e88b3d2abee2a82c1ac8366da848fce1c6d834dc2132c3fda3970290/rpds_py-0.30.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:a2bffea6a4ca9f01b3f8e548302470306689684e61602aa3d141e34da06cf425", size = 370157, upload-time = "2025-11-30T20:21:53.789Z" },
+    { url = "https://files.pythonhosted.org/packages/94/ba/24e5ebb7c1c82e74c4e4f33b2112a5573ddc703915b13a073737b59b86e0/rpds_py-0.30.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc4f992dfe1e2bc3ebc7444f6c7051b4bc13cd8e33e43511e8ffd13bf407010d", size = 359676, upload-time = "2025-11-30T20:21:55.475Z" },
+    { url = "https://files.pythonhosted.org/packages/84/86/04dbba1b087227747d64d80c3b74df946b986c57af0a9f0c98726d4d7a3b/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:422c3cb9856d80b09d30d2eb255d0754b23e090034e1deb4083f8004bd0761e4", size = 389938, upload-time = "2025-11-30T20:21:57.079Z" },
+    { url = "https://files.pythonhosted.org/packages/42/bb/1463f0b1722b7f45431bdd468301991d1328b16cffe0b1c2918eba2c4eee/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:07ae8a593e1c3c6b82ca3292efbe73c30b61332fd612e05abee07c79359f292f", size = 402932, upload-time = "2025-11-30T20:21:58.47Z" },
+    { url = "https://files.pythonhosted.org/packages/99/ee/2520700a5c1f2d76631f948b0736cdf9b0acb25abd0ca8e889b5c62ac2e3/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f90dd7557b6bd57f40abe7747e81e0c0b119bef015ea7726e69fe550e394a4", size = 525830, upload-time = "2025-11-30T20:21:59.699Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/ad/bd0331f740f5705cc555a5e17fdf334671262160270962e69a2bdef3bf76/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:99b47d6ad9a6da00bec6aabe5a6279ecd3c06a329d4aa4771034a21e335c3a97", size = 412033, upload-time = "2025-11-30T20:22:00.991Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/1e/372195d326549bb51f0ba0f2ecb9874579906b97e08880e7a65c3bef1a99/rpds_py-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33f559f3104504506a44bb666b93a33f5d33133765b0c216a5bf2f1e1503af89", size = 390828, upload-time = "2025-11-30T20:22:02.723Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/2b/d88bb33294e3e0c76bc8f351a3721212713629ffca1700fa94979cb3eae8/rpds_py-0.30.0-cp311-cp311-manylinux_2_31_riscv64.whl", hash = "sha256:946fe926af6e44f3697abbc305ea168c2c31d3e3ef1058cf68f379bf0335a78d", size = 404683, upload-time = "2025-11-30T20:22:04.367Z" },
+    { url = "https://files.pythonhosted.org/packages/50/32/c759a8d42bcb5289c1fac697cd92f6fe01a018dd937e62ae77e0e7f15702/rpds_py-0.30.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:495aeca4b93d465efde585977365187149e75383ad2684f81519f504f5c13038", size = 421583, upload-time = "2025-11-30T20:22:05.814Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/81/e729761dbd55ddf5d84ec4ff1f47857f4374b0f19bdabfcf929164da3e24/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d9a0ca5da0386dee0655b4ccdf46119df60e0f10da268d04fe7cc87886872ba7", size = 572496, upload-time = "2025-11-30T20:22:07.713Z" },
+    { url = "https://files.pythonhosted.org/packages/14/f6/69066a924c3557c9c30baa6ec3a0aa07526305684c6f86c696b08860726c/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8d6d1cc13664ec13c1b84241204ff3b12f9bb82464b8ad6e7a5d3486975c2eed", size = 598669, upload-time = "2025-11-30T20:22:09.312Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/48/905896b1eb8a05630d20333d1d8ffd162394127b74ce0b0784ae04498d32/rpds_py-0.30.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:3896fa1be39912cf0757753826bc8bdc8ca331a28a7c4ae46b7a21280b06bb85", size = 561011, upload-time = "2025-11-30T20:22:11.309Z" },
+    { url = "https://files.pythonhosted.org/packages/22/16/cd3027c7e279d22e5eb431dd3c0fbc677bed58797fe7581e148f3f68818b/rpds_py-0.30.0-cp311-cp311-win32.whl", hash = "sha256:55f66022632205940f1827effeff17c4fa7ae1953d2b74a8581baaefb7d16f8c", size = 221406, upload-time = "2025-11-30T20:22:13.101Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/5b/e7b7aa136f28462b344e652ee010d4de26ee9fd16f1bfd5811f5153ccf89/rpds_py-0.30.0-cp311-cp311-win_amd64.whl", hash = "sha256:a51033ff701fca756439d641c0ad09a41d9242fa69121c7d8769604a0a629825", size = 236024, upload-time = "2025-11-30T20:22:14.853Z" },
+    { url = "https://files.pythonhosted.org/packages/14/a6/364bba985e4c13658edb156640608f2c9e1d3ea3c81b27aa9d889fff0e31/rpds_py-0.30.0-cp311-cp311-win_arm64.whl", hash = "sha256:47b0ef6231c58f506ef0b74d44e330405caa8428e770fec25329ed2cb971a229", size = 229069, upload-time = "2025-11-30T20:22:16.577Z" },
     { url = "https://files.pythonhosted.org/packages/03/e7/98a2f4ac921d82f33e03f3835f5bf3a4a40aa1bfdc57975e74a97b2b4bdd/rpds_py-0.30.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a161f20d9a43006833cd7068375a94d035714d73a172b681d8881820600abfad", size = 375086, upload-time = "2025-11-30T20:22:17.93Z" },
     { url = "https://files.pythonhosted.org/packages/4d/a1/bca7fd3d452b272e13335db8d6b0b3ecde0f90ad6f16f3328c6fb150c889/rpds_py-0.30.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6abc8880d9d036ecaafe709079969f56e876fcf107f7a8e9920ba6d5a3878d05", size = 359053, upload-time = "2025-11-30T20:22:19.297Z" },
     { url = "https://files.pythonhosted.org/packages/65/1c/ae157e83a6357eceff62ba7e52113e3ec4834a84cfe07fa4b0757a7d105f/rpds_py-0.30.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca28829ae5f5d569bb62a79512c842a03a12576375d5ece7d2cadf8abe96ec28", size = 390763, upload-time = "2025-11-30T20:22:21.661Z" },
@@ -5764,6 +6946,18 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/85/70/92482ccffb96f5441aab93e26c4d66489eb599efdcf96fad90c14bbfb976/rpds_py-0.30.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:dbd936cde57abfee19ab3213cf9c26be06d60750e60a8e4dd85d1ab12c8b1f40", size = 556030, upload-time = "2025-11-30T20:24:10.956Z" },
     { url = "https://files.pythonhosted.org/packages/20/53/7c7e784abfa500a2b6b583b147ee4bb5a2b3747a9166bab52fec4b5b5e7d/rpds_py-0.30.0-cp314-cp314t-win32.whl", hash = "sha256:dc824125c72246d924f7f796b4f63c1e9dc810c7d9e2355864b3c3a73d59ade0", size = 211570, upload-time = "2025-11-30T20:24:12.735Z" },
     { url = "https://files.pythonhosted.org/packages/d0/02/fa464cdfbe6b26e0600b62c528b72d8608f5cc49f96b8d6e38c95d60c676/rpds_py-0.30.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27f4b0e92de5bfbc6f86e43959e6edd1425c33b5e69aab0984a72047f2bcf1e3", size = 226532, upload-time = "2025-11-30T20:24:14.634Z" },
+    { url = "https://files.pythonhosted.org/packages/69/71/3f34339ee70521864411f8b6992e7ab13ac30d8e4e3309e07c7361767d91/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c2262bdba0ad4fc6fb5545660673925c2d2a5d9e2e0fb603aad545427be0fc58", size = 372292, upload-time = "2025-11-30T20:24:16.537Z" },
+    { url = "https://files.pythonhosted.org/packages/57/09/f183df9b8f2d66720d2ef71075c59f7e1b336bec7ee4c48f0a2b06857653/rpds_py-0.30.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:ee6af14263f25eedc3bb918a3c04245106a42dfd4f5c2285ea6f997b1fc3f89a", size = 362128, upload-time = "2025-11-30T20:24:18.086Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/68/5c2594e937253457342e078f0cc1ded3dd7b2ad59afdbf2d354869110a02/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3adbb8179ce342d235c31ab8ec511e66c73faa27a47e076ccc92421add53e2bb", size = 391542, upload-time = "2025-11-30T20:24:20.092Z" },
+    { url = "https://files.pythonhosted.org/packages/49/5c/31ef1afd70b4b4fbdb2800249f34c57c64beb687495b10aec0365f53dfc4/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:250fa00e9543ac9b97ac258bd37367ff5256666122c2d0f2bc97577c60a1818c", size = 404004, upload-time = "2025-11-30T20:24:22.231Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/63/0cfbea38d05756f3440ce6534d51a491d26176ac045e2707adc99bb6e60a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9854cf4f488b3d57b9aaeb105f06d78e5529d3145b1e4a41750167e8c213c6d3", size = 527063, upload-time = "2025-11-30T20:24:24.302Z" },
+    { url = "https://files.pythonhosted.org/packages/42/e6/01e1f72a2456678b0f618fc9a1a13f882061690893c192fcad9f2926553a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:993914b8e560023bc0a8bf742c5f303551992dcb85e247b1e5c7f4a7d145bda5", size = 413099, upload-time = "2025-11-30T20:24:25.916Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/25/8df56677f209003dcbb180765520c544525e3ef21ea72279c98b9aa7c7fb/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:58edca431fb9b29950807e301826586e5bbf24163677732429770a697ffe6738", size = 392177, upload-time = "2025-11-30T20:24:27.834Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/b4/0a771378c5f16f8115f796d1f437950158679bcd2a7c68cf251cfb00ed5b/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_31_riscv64.whl", hash = "sha256:dea5b552272a944763b34394d04577cf0f9bd013207bc32323b5a89a53cf9c2f", size = 406015, upload-time = "2025-11-30T20:24:29.457Z" },
+    { url = "https://files.pythonhosted.org/packages/36/d8/456dbba0af75049dc6f63ff295a2f92766b9d521fa00de67a2bd6427d57a/rpds_py-0.30.0-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:ba3af48635eb83d03f6c9735dfb21785303e73d22ad03d489e88adae6eab8877", size = 423736, upload-time = "2025-11-30T20:24:31.22Z" },
+    { url = "https://files.pythonhosted.org/packages/13/64/b4d76f227d5c45a7e0b796c674fd81b0a6c4fbd48dc29271857d8219571c/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:dff13836529b921e22f15cb099751209a60009731a68519630a24d61f0b1b30a", size = 573981, upload-time = "2025-11-30T20:24:32.934Z" },
+    { url = "https://files.pythonhosted.org/packages/20/91/092bacadeda3edf92bf743cc96a7be133e13a39cdbfd7b5082e7ab638406/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_i686.whl", hash = "sha256:1b151685b23929ab7beec71080a8889d4d6d9fa9a983d213f07121205d48e2c4", size = 599782, upload-time = "2025-11-30T20:24:35.169Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/b7/b95708304cd49b7b6f82fdd039f1748b66ec2b21d6a45180910802f1abf1/rpds_py-0.30.0-pp311-pypy311_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:ac37f9f516c51e5753f27dfdef11a88330f04de2d564be3991384b2f3535d02e", size = 562191, upload-time = "2025-11-30T20:24:36.853Z" },
 ]
 
 [[package]]
@@ -5795,52 +6989,16 @@ wheels = [
 name = "s3fs"
 version = "2026.2.0"
 source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-]
 dependencies = [
-    { name = "aiobotocore", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "aiohttp", marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
-    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'win32'" },
+    { name = "aiobotocore" },
+    { name = "aiohttp" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" } },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fa/be/392c8c5e0da9bfa139e41084690dd49a5e3e931099f78f52d3f6070105c6/s3fs-2026.2.0.tar.gz", hash = "sha256:91cb2a9f76e35643b76eeac3f47a6165172bb3def671f76b9111c8dd5779a2ac", size = 84152, upload-time = "2026-02-05T21:57:57.968Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/57/e1/64c264db50b68de8a438b60ceeb921b2f22da3ebb7ad6255150225d0beac/s3fs-2026.2.0-py3-none-any.whl", hash = "sha256:65198835b86b1d5771112b0085d1da52a6ede36508b1aaa6cae2aedc765dfe10", size = 31328, upload-time = "2026-02-05T21:57:56.532Z" },
 ]
 
-[[package]]
-name = "s3fs"
-version = "2026.3.0"
-source = { registry = "https://pypi.org/simple" }
-resolution-markers = [
-    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32'",
-]
-dependencies = [
-    { name = "aiobotocore", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "aiohttp", marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-    { name = "fsspec", version = "2026.3.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' and sys_platform == 'win32'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/0b/93/093972862fb9c2fdc24ecf8d6d2212853df1945eddf26ba2625e8eaeee66/s3fs-2026.3.0.tar.gz", hash = "sha256:ce8b30a9dc5e01c5127c96cb7377290243a689a251ef9257336ac29d72d7b0d8", size = 85986, upload-time = "2026-03-27T19:28:20.963Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/6a/52/5ccdc01f7a8a61357d15a66b5d8a6580aa8529cb33f32e6cbb71c52622c5/s3fs-2026.3.0-py3-none-any.whl", hash = "sha256:2fa40a64c03003cfa5ae0e352788d97aa78ae8f9e25ea98b28ce9d21ba10c1b8", size = 32399, upload-time = "2026-03-27T19:28:19.702Z" },
-]
-
 [[package]]
 name = "safetensors"
 version = "0.7.0"
@@ -5861,17 +7019,117 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4a/d8/0c8a7dc9b41dcac53c4cbf9df2b9c83e0e0097203de8b37a712b345c0be5/safetensors-0.7.0-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b0f6d66c1c538d5a94a73aa9ddca8ccc4227e6c9ff555322ea40bdd142391dd4", size = 677368, upload-time = "2025-11-19T15:18:41.627Z" },
     { url = "https://files.pythonhosted.org/packages/05/e5/cb4b713c8a93469e3c5be7c3f8d77d307e65fe89673e731f5c2bfd0a9237/safetensors-0.7.0-cp38-abi3-win32.whl", hash = "sha256:c74af94bf3ac15ac4d0f2a7c7b4663a15f8c2ab15ed0fc7531ca61d0835eccba", size = 326423, upload-time = "2025-11-19T15:18:45.74Z" },
     { url = "https://files.pythonhosted.org/packages/5d/e6/ec8471c8072382cb91233ba7267fd931219753bb43814cbc71757bfd4dab/safetensors-0.7.0-cp38-abi3-win_amd64.whl", hash = "sha256:d1239932053f56f3456f32eb9625590cc7582e905021f94636202a864d470755", size = 341380, upload-time = "2025-11-19T15:18:44.427Z" },
+    { url = "https://files.pythonhosted.org/packages/a7/6a/4d08d89a6fcbe905c5ae68b8b34f0791850882fc19782d0d02c65abbdf3b/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f4729811a6640d019a4b7ba8638ee2fd21fa5ca8c7e7bdf0fed62068fcaac737", size = 492430, upload-time = "2025-11-19T15:18:11.884Z" },
+    { url = "https://files.pythonhosted.org/packages/dd/29/59ed8152b30f72c42d00d241e58eaca558ae9dbfa5695206e2e0f54c7063/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:12f49080303fa6bb424b362149a12949dfbbf1e06811a88f2307276b0c131afd", size = 503977, upload-time = "2025-11-19T15:18:17.523Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/0b/4811bfec67fa260e791369b16dab105e4bae82686120554cc484064e22b4/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0071bffba4150c2f46cae1432d31995d77acfd9f8db598b5d1a2ce67e8440ad2", size = 623890, upload-time = "2025-11-19T15:18:22.666Z" },
+    { url = "https://files.pythonhosted.org/packages/58/5b/632a58724221ef03d78ab65062e82a1010e1bef8e8e0b9d7c6d7b8044841/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:473b32699f4200e69801bf5abf93f1a4ecd432a70984df164fc22ccf39c4a6f3", size = 531885, upload-time = "2025-11-19T15:18:27.146Z" },
+]
+
+[[package]]
+name = "scipy"
+version = "1.15.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
+]
+dependencies = [
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0f/37/6964b830433e654ec7485e45a00fc9a27cf868d622838f6b6d9c5ec0d532/scipy-1.15.3.tar.gz", hash = "sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf", size = 59419214, upload-time = "2025-05-08T16:13:05.955Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/78/2f/4966032c5f8cc7e6a60f1b2e0ad686293b9474b65246b0c642e3ef3badd0/scipy-1.15.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a345928c86d535060c9c2b25e71e87c39ab2f22fc96e9636bd74d1dbf9de448c", size = 38702770, upload-time = "2025-05-08T16:04:20.849Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/6e/0c3bf90fae0e910c274db43304ebe25a6b391327f3f10b5dcc638c090795/scipy-1.15.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:ad3432cb0f9ed87477a8d97f03b763fd1d57709f1bbde3c9369b1dff5503b253", size = 30094511, upload-time = "2025-05-08T16:04:27.103Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/b1/4deb37252311c1acff7f101f6453f0440794f51b6eacb1aad4459a134081/scipy-1.15.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:aef683a9ae6eb00728a542b796f52a5477b78252edede72b8327a886ab63293f", size = 22368151, upload-time = "2025-05-08T16:04:31.731Z" },
+    { url = "https://files.pythonhosted.org/packages/38/7d/f457626e3cd3c29b3a49ca115a304cebb8cc6f31b04678f03b216899d3c6/scipy-1.15.3-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:1c832e1bd78dea67d5c16f786681b28dd695a8cb1fb90af2e27580d3d0967e92", size = 25121732, upload-time = "2025-05-08T16:04:36.596Z" },
+    { url = "https://files.pythonhosted.org/packages/db/0a/92b1de4a7adc7a15dcf5bddc6e191f6f29ee663b30511ce20467ef9b82e4/scipy-1.15.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:263961f658ce2165bbd7b99fa5135195c3a12d9bef045345016b8b50c315cb82", size = 35547617, upload-time = "2025-05-08T16:04:43.546Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/6d/41991e503e51fc1134502694c5fa7a1671501a17ffa12716a4a9151af3df/scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2abc762b0811e09a0d3258abee2d98e0c703eee49464ce0069590846f31d40", size = 37662964, upload-time = "2025-05-08T16:04:49.431Z" },
+    { url = "https://files.pythonhosted.org/packages/25/e1/3df8f83cb15f3500478c889be8fb18700813b95e9e087328230b98d547ff/scipy-1.15.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed7284b21a7a0c8f1b6e5977ac05396c0d008b89e05498c8b7e8f4a1423bba0e", size = 37238749, upload-time = "2025-05-08T16:04:55.215Z" },
+    { url = "https://files.pythonhosted.org/packages/93/3e/b3257cf446f2a3533ed7809757039016b74cd6f38271de91682aa844cfc5/scipy-1.15.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5380741e53df2c566f4d234b100a484b420af85deb39ea35a1cc1be84ff53a5c", size = 40022383, upload-time = "2025-05-08T16:05:01.914Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/84/55bc4881973d3f79b479a5a2e2df61c8c9a04fcb986a213ac9c02cfb659b/scipy-1.15.3-cp310-cp310-win_amd64.whl", hash = "sha256:9d61e97b186a57350f6d6fd72640f9e99d5a4a2b8fbf4b9ee9a841eab327dc13", size = 41259201, upload-time = "2025-05-08T16:05:08.166Z" },
+    { url = "https://files.pythonhosted.org/packages/96/ab/5cc9f80f28f6a7dff646c5756e559823614a42b1939d86dd0ed550470210/scipy-1.15.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:993439ce220d25e3696d1b23b233dd010169b62f6456488567e830654ee37a6b", size = 38714255, upload-time = "2025-05-08T16:05:14.596Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/4a/66ba30abe5ad1a3ad15bfb0b59d22174012e8056ff448cb1644deccbfed2/scipy-1.15.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:34716e281f181a02341ddeaad584205bd2fd3c242063bd3423d61ac259ca7eba", size = 30111035, upload-time = "2025-05-08T16:05:20.152Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/fa/a7e5b95afd80d24313307f03624acc65801846fa75599034f8ceb9e2cbf6/scipy-1.15.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b0334816afb8b91dab859281b1b9786934392aa3d527cd847e41bb6f45bee65", size = 22384499, upload-time = "2025-05-08T16:05:24.494Z" },
+    { url = "https://files.pythonhosted.org/packages/17/99/f3aaddccf3588bb4aea70ba35328c204cadd89517a1612ecfda5b2dd9d7a/scipy-1.15.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6db907c7368e3092e24919b5e31c76998b0ce1684d51a90943cb0ed1b4ffd6c1", size = 25152602, upload-time = "2025-05-08T16:05:29.313Z" },
+    { url = "https://files.pythonhosted.org/packages/56/c5/1032cdb565f146109212153339f9cb8b993701e9fe56b1c97699eee12586/scipy-1.15.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:721d6b4ef5dc82ca8968c25b111e307083d7ca9091bc38163fb89243e85e3889", size = 35503415, upload-time = "2025-05-08T16:05:34.699Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/37/89f19c8c05505d0601ed5650156e50eb881ae3918786c8fd7262b4ee66d3/scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39cb9c62e471b1bb3750066ecc3a3f3052b37751c7c3dfd0fd7e48900ed52982", size = 37652622, upload-time = "2025-05-08T16:05:40.762Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/31/be59513aa9695519b18e1851bb9e487de66f2d31f835201f1b42f5d4d475/scipy-1.15.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:795c46999bae845966368a3c013e0e00947932d68e235702b5c3f6ea799aa8c9", size = 37244796, upload-time = "2025-05-08T16:05:48.119Z" },
+    { url = "https://files.pythonhosted.org/packages/10/c0/4f5f3eeccc235632aab79b27a74a9130c6c35df358129f7ac8b29f562ac7/scipy-1.15.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:18aaacb735ab38b38db42cb01f6b92a2d0d4b6aabefeb07f02849e47f8fb3594", size = 40047684, upload-time = "2025-05-08T16:05:54.22Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/a7/0ddaf514ce8a8714f6ed243a2b391b41dbb65251affe21ee3077ec45ea9a/scipy-1.15.3-cp311-cp311-win_amd64.whl", hash = "sha256:ae48a786a28412d744c62fd7816a4118ef97e5be0bee968ce8f0a2fba7acf3bb", size = 41246504, upload-time = "2025-05-08T16:06:00.437Z" },
+    { url = "https://files.pythonhosted.org/packages/37/4b/683aa044c4162e10ed7a7ea30527f2cbd92e6999c10a8ed8edb253836e9c/scipy-1.15.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6ac6310fdbfb7aa6612408bd2f07295bcbd3fda00d2d702178434751fe48e019", size = 38766735, upload-time = "2025-05-08T16:06:06.471Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/7e/f30be3d03de07f25dc0ec926d1681fed5c732d759ac8f51079708c79e680/scipy-1.15.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:185cd3d6d05ca4b44a8f1595af87f9c372bb6acf9c808e99aa3e9aa03bd98cf6", size = 30173284, upload-time = "2025-05-08T16:06:11.686Z" },
+    { url = "https://files.pythonhosted.org/packages/07/9c/0ddb0d0abdabe0d181c1793db51f02cd59e4901da6f9f7848e1f96759f0d/scipy-1.15.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:05dc6abcd105e1a29f95eada46d4a3f251743cfd7d3ae8ddb4088047f24ea477", size = 22446958, upload-time = "2025-05-08T16:06:15.97Z" },
+    { url = "https://files.pythonhosted.org/packages/af/43/0bce905a965f36c58ff80d8bea33f1f9351b05fad4beaad4eae34699b7a1/scipy-1.15.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:06efcba926324df1696931a57a176c80848ccd67ce6ad020c810736bfd58eb1c", size = 25242454, upload-time = "2025-05-08T16:06:20.394Z" },
+    { url = "https://files.pythonhosted.org/packages/56/30/a6f08f84ee5b7b28b4c597aca4cbe545535c39fe911845a96414700b64ba/scipy-1.15.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05045d8b9bfd807ee1b9f38761993297b10b245f012b11b13b91ba8945f7e45", size = 35210199, upload-time = "2025-05-08T16:06:26.159Z" },
+    { url = "https://files.pythonhosted.org/packages/0b/1f/03f52c282437a168ee2c7c14a1a0d0781a9a4a8962d84ac05c06b4c5b555/scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:271e3713e645149ea5ea3e97b57fdab61ce61333f97cfae392c28ba786f9bb49", size = 37309455, upload-time = "2025-05-08T16:06:32.778Z" },
+    { url = "https://files.pythonhosted.org/packages/89/b1/fbb53137f42c4bf630b1ffdfc2151a62d1d1b903b249f030d2b1c0280af8/scipy-1.15.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6cfd56fc1a8e53f6e89ba3a7a7251f7396412d655bca2aa5611c8ec9a6784a1e", size = 36885140, upload-time = "2025-05-08T16:06:39.249Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/2e/025e39e339f5090df1ff266d021892694dbb7e63568edcfe43f892fa381d/scipy-1.15.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0ff17c0bb1cb32952c09217d8d1eed9b53d1463e5f1dd6052c7857f83127d539", size = 39710549, upload-time = "2025-05-08T16:06:45.729Z" },
+    { url = "https://files.pythonhosted.org/packages/e6/eb/3bf6ea8ab7f1503dca3a10df2e4b9c3f6b3316df07f6c0ded94b281c7101/scipy-1.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed", size = 40966184, upload-time = "2025-05-08T16:06:52.623Z" },
+    { url = "https://files.pythonhosted.org/packages/73/18/ec27848c9baae6e0d6573eda6e01a602e5649ee72c27c3a8aad673ebecfd/scipy-1.15.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2c620736bcc334782e24d173c0fdbb7590a0a436d2fdf39310a8902505008759", size = 38728256, upload-time = "2025-05-08T16:06:58.696Z" },
+    { url = "https://files.pythonhosted.org/packages/74/cd/1aef2184948728b4b6e21267d53b3339762c285a46a274ebb7863c9e4742/scipy-1.15.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:7e11270a000969409d37ed399585ee530b9ef6aa99d50c019de4cb01e8e54e62", size = 30109540, upload-time = "2025-05-08T16:07:04.209Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/d8/59e452c0a255ec352bd0a833537a3bc1bfb679944c4938ab375b0a6b3a3e/scipy-1.15.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:8c9ed3ba2c8a2ce098163a9bdb26f891746d02136995df25227a20e71c396ebb", size = 22383115, upload-time = "2025-05-08T16:07:08.998Z" },
+    { url = "https://files.pythonhosted.org/packages/08/f5/456f56bbbfccf696263b47095291040655e3cbaf05d063bdc7c7517f32ac/scipy-1.15.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:0bdd905264c0c9cfa74a4772cdb2070171790381a5c4d312c973382fc6eaf730", size = 25163884, upload-time = "2025-05-08T16:07:14.091Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/66/a9618b6a435a0f0c0b8a6d0a2efb32d4ec5a85f023c2b79d39512040355b/scipy-1.15.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79167bba085c31f38603e11a267d862957cbb3ce018d8b38f79ac043bc92d825", size = 35174018, upload-time = "2025-05-08T16:07:19.427Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/09/c5b6734a50ad4882432b6bb7c02baf757f5b2f256041da5df242e2d7e6b6/scipy-1.15.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9deabd6d547aee2c9a81dee6cc96c6d7e9a9b1953f74850c179f91fdc729cb7", size = 37269716, upload-time = "2025-05-08T16:07:25.712Z" },
+    { url = "https://files.pythonhosted.org/packages/77/0a/eac00ff741f23bcabd352731ed9b8995a0a60ef57f5fd788d611d43d69a1/scipy-1.15.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:dde4fc32993071ac0c7dd2d82569e544f0bdaff66269cb475e0f369adad13f11", size = 36872342, upload-time = "2025-05-08T16:07:31.468Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/54/4379be86dd74b6ad81551689107360d9a3e18f24d20767a2d5b9253a3f0a/scipy-1.15.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f77f853d584e72e874d87357ad70f44b437331507d1c311457bed8ed2b956126", size = 39670869, upload-time = "2025-05-08T16:07:38.002Z" },
+    { url = "https://files.pythonhosted.org/packages/87/2e/892ad2862ba54f084ffe8cc4a22667eaf9c2bcec6d2bff1d15713c6c0703/scipy-1.15.3-cp313-cp313-win_amd64.whl", hash = "sha256:b90ab29d0c37ec9bf55424c064312930ca5f4bde15ee8619ee44e69319aab163", size = 40988851, upload-time = "2025-05-08T16:08:33.671Z" },
+    { url = "https://files.pythonhosted.org/packages/1b/e9/7a879c137f7e55b30d75d90ce3eb468197646bc7b443ac036ae3fe109055/scipy-1.15.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3ac07623267feb3ae308487c260ac684b32ea35fd81e12845039952f558047b8", size = 38863011, upload-time = "2025-05-08T16:07:44.039Z" },
+    { url = "https://files.pythonhosted.org/packages/51/d1/226a806bbd69f62ce5ef5f3ffadc35286e9fbc802f606a07eb83bf2359de/scipy-1.15.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6487aa99c2a3d509a5227d9a5e889ff05830a06b2ce08ec30df6d79db5fcd5c5", size = 30266407, upload-time = "2025-05-08T16:07:49.891Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/9b/f32d1d6093ab9eeabbd839b0f7619c62e46cc4b7b6dbf05b6e615bbd4400/scipy-1.15.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:50f9e62461c95d933d5c5ef4a1f2ebf9a2b4e83b0db374cb3f1de104d935922e", size = 22540030, upload-time = "2025-05-08T16:07:54.121Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/29/c278f699b095c1a884f29fda126340fcc201461ee8bfea5c8bdb1c7c958b/scipy-1.15.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:14ed70039d182f411ffc74789a16df3835e05dc469b898233a245cdfd7f162cb", size = 25218709, upload-time = "2025-05-08T16:07:58.506Z" },
+    { url = "https://files.pythonhosted.org/packages/24/18/9e5374b617aba742a990581373cd6b68a2945d65cc588482749ef2e64467/scipy-1.15.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a769105537aa07a69468a0eefcd121be52006db61cdd8cac8a0e68980bbb723", size = 34809045, upload-time = "2025-05-08T16:08:03.929Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/fe/9c4361e7ba2927074360856db6135ef4904d505e9b3afbbcb073c4008328/scipy-1.15.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9db984639887e3dffb3928d118145ffe40eff2fa40cb241a306ec57c219ebbbb", size = 36703062, upload-time = "2025-05-08T16:08:09.558Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/8e/038ccfe29d272b30086b25a4960f757f97122cb2ec42e62b460d02fe98e9/scipy-1.15.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:40e54d5c7e7ebf1aa596c374c49fa3135f04648a0caabcb66c52884b943f02b4", size = 36393132, upload-time = "2025-05-08T16:08:15.34Z" },
+    { url = "https://files.pythonhosted.org/packages/10/7e/5c12285452970be5bdbe8352c619250b97ebf7917d7a9a9e96b8a8140f17/scipy-1.15.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5e721fed53187e71d0ccf382b6bf977644c533e506c4d33c3fb24de89f5c3ed5", size = 38979503, upload-time = "2025-05-08T16:08:21.513Z" },
+    { url = "https://files.pythonhosted.org/packages/81/06/0a5e5349474e1cbc5757975b21bd4fad0e72ebf138c5592f191646154e06/scipy-1.15.3-cp313-cp313t-win_amd64.whl", hash = "sha256:76ad1fb5f8752eabf0fa02e4cc0336b4e8f021e2d5f061ed37d6d264db35e3ca", size = 40308097, upload-time = "2025-05-08T16:08:27.627Z" },
 ]
 
 [[package]]
 name = "scipy"
 version = "1.17.1"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+]
 dependencies = [
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" } },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/7a/97/5a3609c4f8d58b039179648e62dd220f89864f56f7357f5d4f45c29eb2cc/scipy-1.17.1.tar.gz", hash = "sha256:95d8e012d8cb8816c226aef832200b1d45109ed4464303e997c5b13122b297c0", size = 30573822, upload-time = "2026-02-23T00:26:24.851Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/df/75/b4ce781849931fef6fd529afa6b63711d5a733065722d0c3e2724af9e40a/scipy-1.17.1-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:1f95b894f13729334fb990162e911c9e5dc1ab390c58aa6cbecb389c5b5e28ec", size = 31613675, upload-time = "2026-02-23T00:16:00.13Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/58/bccc2861b305abdd1b8663d6130c0b3d7cc22e8d86663edbc8401bfd40d4/scipy-1.17.1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e18f12c6b0bc5a592ed23d3f7b891f68fd7f8241d69b7883769eb5d5dfb52696", size = 28162057, upload-time = "2026-02-23T00:16:09.456Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/ee/18146b7757ed4976276b9c9819108adbc73c5aad636e5353e20746b73069/scipy-1.17.1-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:a3472cfbca0a54177d0faa68f697d8ba4c80bbdc19908c3465556d9f7efce9ee", size = 20334032, upload-time = "2026-02-23T00:16:17.358Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/e6/cef1cf3557f0c54954198554a10016b6a03b2ec9e22a4e1df734936bd99c/scipy-1.17.1-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:766e0dc5a616d026a3a1cffa379af959671729083882f50307e18175797b3dfd", size = 22709533, upload-time = "2026-02-23T00:16:25.791Z" },
+    { url = "https://files.pythonhosted.org/packages/4d/60/8804678875fc59362b0fb759ab3ecce1f09c10a735680318ac30da8cd76b/scipy-1.17.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:744b2bf3640d907b79f3fd7874efe432d1cf171ee721243e350f55234b4cec4c", size = 33062057, upload-time = "2026-02-23T00:16:36.931Z" },
+    { url = "https://files.pythonhosted.org/packages/09/7d/af933f0f6e0767995b4e2d705a0665e454d1c19402aa7e895de3951ebb04/scipy-1.17.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43af8d1f3bea642559019edfe64e9b11192a8978efbd1539d7bc2aaa23d92de4", size = 35349300, upload-time = "2026-02-23T00:16:49.108Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/3d/7ccbbdcbb54c8fdc20d3b6930137c782a163fa626f0aef920349873421ba/scipy-1.17.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:cd96a1898c0a47be4520327e01f874acfd61fb48a9420f8aa9f6483412ffa444", size = 35127333, upload-time = "2026-02-23T00:17:01.293Z" },
+    { url = "https://files.pythonhosted.org/packages/e8/19/f926cb11c42b15ba08e3a71e376d816ac08614f769b4f47e06c3580c836a/scipy-1.17.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4eb6c25dd62ee8d5edf68a8e1c171dd71c292fdae95d8aeb3dd7d7de4c364082", size = 37741314, upload-time = "2026-02-23T00:17:12.576Z" },
+    { url = "https://files.pythonhosted.org/packages/95/da/0d1df507cf574b3f224ccc3d45244c9a1d732c81dcb26b1e8a766ae271a8/scipy-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:d30e57c72013c2a4fe441c2fcb8e77b14e152ad48b5464858e07e2ad9fbfceff", size = 36607512, upload-time = "2026-02-23T00:17:23.424Z" },
+    { url = "https://files.pythonhosted.org/packages/68/7f/bdd79ceaad24b671543ffe0ef61ed8e659440eb683b66f033454dcee90eb/scipy-1.17.1-cp311-cp311-win_arm64.whl", hash = "sha256:9ecb4efb1cd6e8c4afea0daa91a87fbddbce1b99d2895d151596716c0b2e859d", size = 24599248, upload-time = "2026-02-23T00:17:34.561Z" },
     { url = "https://files.pythonhosted.org/packages/35/48/b992b488d6f299dbe3f11a20b24d3dda3d46f1a635ede1c46b5b17a7b163/scipy-1.17.1-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:35c3a56d2ef83efc372eaec584314bd0ef2e2f0d2adb21c55e6ad5b344c0dcb8", size = 31610954, upload-time = "2026-02-23T00:17:49.855Z" },
     { url = "https://files.pythonhosted.org/packages/b2/02/cf107b01494c19dc100f1d0b7ac3cc08666e96ba2d64db7626066cee895e/scipy-1.17.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:fcb310ddb270a06114bb64bbe53c94926b943f5b7f0842194d585c65eb4edd76", size = 28172662, upload-time = "2026-02-23T00:18:01.64Z" },
     { url = "https://files.pythonhosted.org/packages/cf/a9/599c28631bad314d219cf9ffd40e985b24d603fc8a2f4ccc5ae8419a535b/scipy-1.17.1-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cc90d2e9c7e5c7f1a482c9875007c095c3194b1cfedca3c2f3291cdc2bc7c086", size = 20344366, upload-time = "2026-02-23T00:18:12.015Z" },
@@ -5930,6 +7188,22 @@ version = "0.2.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/15/15/2e7a025fc62d764b151ae6d0f2a92f8081755ebe8d4a64099accc6f77ba6/sentencepiece-0.2.1.tar.gz", hash = "sha256:8138cec27c2f2282f4a34d9a016e3374cd40e5c6e9cb335063db66a0a3b71fad", size = 3228515, upload-time = "2025-08-12T07:00:51.718Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/af/31/5b7cccb307b485db1a2372d6d2980b0a65d067f8be5ca943a103b4acd5b3/sentencepiece-0.2.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:e10fa50bdbaa5e2445dbd387979980d391760faf0ec99a09bd7780ff37eaec44", size = 1942557, upload-time = "2025-08-12T06:59:12.379Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/41/0ac923a8e685ad290c5afc8ae55c5844977b8d75076fcc04302b9a324274/sentencepiece-0.2.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:2f27ae6deea72efdb6f361750c92f6c21fd0ad087445082770cc34015213c526", size = 1325384, upload-time = "2025-08-12T06:59:14.334Z" },
+    { url = "https://files.pythonhosted.org/packages/fc/ef/3751555d67daf9003384978f169d31c775cb5c7baf28633caaf1eb2b2b4d/sentencepiece-0.2.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:60937c959e6f44159fdd9f56fbdd302501f96114a5ba436829496d5f32d8de3f", size = 1253317, upload-time = "2025-08-12T06:59:16.247Z" },
+    { url = "https://files.pythonhosted.org/packages/46/a5/742c69b7bd144eb32b6e5fd50dbd8abbbc7a95fce2fe16e50156fa400e3b/sentencepiece-0.2.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8b1d91545578852f128650b8cce4ec20f93d39b378ff554ebe66290f2dabb92", size = 1316379, upload-time = "2025-08-12T06:59:17.825Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/89/8deeafbba2871e8fa10f20f17447786f4ac38085925335728d360eaf4cae/sentencepiece-0.2.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:27e38eee653abc3d387862e67bc5c8b6f428cd604e688b85d29170b7e725c26c", size = 1387926, upload-time = "2025-08-12T06:59:19.395Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/ca/67fe73005f0ab617c6a970b199754e28e524b6873aa7025224fad3cda252/sentencepiece-0.2.1-cp310-cp310-win32.whl", hash = "sha256:251874d720ac7f28024a168501f3c7bb15d1802245f6e66de565f18bbb9b5eaa", size = 999550, upload-time = "2025-08-12T06:59:20.844Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/33/dc5b54042050d2dda4229c3ce1f862541c99966390b6aa20f54d520d2dc2/sentencepiece-0.2.1-cp310-cp310-win_amd64.whl", hash = "sha256:e52144670738b4b477fade6c2a9b6af71a8d0094514c9853ac9f6fc1fcfabae7", size = 1054613, upload-time = "2025-08-12T06:59:22.255Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/19/1ea47f46ff97fe04422b78997da1a37cd632f414aae042d27a9009c5b733/sentencepiece-0.2.1-cp310-cp310-win_arm64.whl", hash = "sha256:9076430ac25dfa7147d9d05751dbc66a04bc1aaac371c07f84952979ea59f0d0", size = 1033884, upload-time = "2025-08-12T06:59:24.194Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/15/46afbab00733d81788b64be430ca1b93011bb9388527958e26cc31832de5/sentencepiece-0.2.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6356d0986b8b8dc351b943150fcd81a1c6e6e4d439772e8584c64230e58ca987", size = 1942560, upload-time = "2025-08-12T06:59:25.82Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/79/7c01b8ef98a0567e9d84a4e7a910f8e7074fcbf398a5cd76f93f4b9316f9/sentencepiece-0.2.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8f8ba89a3acb3dc1ae90f65ec1894b0b9596fdb98ab003ff38e058f898b39bc7", size = 1325385, upload-time = "2025-08-12T06:59:27.722Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/88/2b41e07bd24f33dcf2f18ec3b74247aa4af3526bad8907b8727ea3caba03/sentencepiece-0.2.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:02593eca45440ef39247cee8c47322a34bdcc1d8ae83ad28ba5a899a2cf8d79a", size = 1253319, upload-time = "2025-08-12T06:59:29.306Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/54/38a1af0c6210a3c6f95aa46d23d6640636d020fba7135cd0d9a84ada05a7/sentencepiece-0.2.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0a0d15781a171d188b661ae4bde1d998c303f6bd8621498c50c671bd45a4798e", size = 1316162, upload-time = "2025-08-12T06:59:30.914Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/66/fb191403ade791ad2c3c1e72fe8413e63781b08cfa3aa4c9dfc536d6e795/sentencepiece-0.2.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4f5a3e0d9f445ed9d66c0fec47d4b23d12cfc858b407a03c194c1b26c2ac2a63", size = 1387785, upload-time = "2025-08-12T06:59:32.491Z" },
+    { url = "https://files.pythonhosted.org/packages/a9/2d/3bd9b08e70067b2124518b308db6a84a4f8901cc8a4317e2e4288cdd9b4d/sentencepiece-0.2.1-cp311-cp311-win32.whl", hash = "sha256:6d297a1748d429ba8534eebe5535448d78b8acc32d00a29b49acf28102eeb094", size = 999555, upload-time = "2025-08-12T06:59:34.475Z" },
+    { url = "https://files.pythonhosted.org/packages/32/b8/f709977f5fda195ae1ea24f24e7c581163b6f142b1005bc3d0bbfe4d7082/sentencepiece-0.2.1-cp311-cp311-win_amd64.whl", hash = "sha256:82d9ead6591015f009cb1be1cb1c015d5e6f04046dbb8c9588b931e869a29728", size = 1054617, upload-time = "2025-08-12T06:59:36.461Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/40/a1fc23be23067da0f703709797b464e8a30a1c78cc8a687120cd58d4d509/sentencepiece-0.2.1-cp311-cp311-win_arm64.whl", hash = "sha256:39f8651bd10974eafb9834ce30d9bcf5b73e1fc798a7f7d2528f9820ca86e119", size = 1033877, upload-time = "2025-08-12T06:59:38.391Z" },
     { url = "https://files.pythonhosted.org/packages/4a/be/32ce495aa1d0e0c323dcb1ba87096037358edee539cac5baf8755a6bd396/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:57cae326c8727de58c85977b175af132a7138d84c764635d7e71bbee7e774133", size = 1943152, upload-time = "2025-08-12T06:59:40.048Z" },
     { url = "https://files.pythonhosted.org/packages/88/7e/ff23008899a58678e98c6ff592bf4d368eee5a71af96d0df6b38a039dd4f/sentencepiece-0.2.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:56dd39a3c4d6493db3cdca7e8cc68c6b633f0d4195495cbadfcf5af8a22d05a6", size = 1325651, upload-time = "2025-08-12T06:59:41.536Z" },
     { url = "https://files.pythonhosted.org/packages/19/84/42eb3ce4796777a1b5d3699dfd4dca85113e68b637f194a6c8d786f16a04/sentencepiece-0.2.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d9381351182ff9888cc80e41c632e7e274b106f450de33d67a9e8f6043da6f76", size = 1253645, upload-time = "2025-08-12T06:59:42.903Z" },
@@ -5974,15 +7248,15 @@ wheels = [
 
 [[package]]
 name = "sentry-sdk"
-version = "2.58.0"
+version = "2.57.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "certifi" },
     { name = "urllib3" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/26/b3/fb8291170d0e844173164709fc0fa0c221ed75a5da740c8746f2a83b4eb1/sentry_sdk-2.58.0.tar.gz", hash = "sha256:c1144d947352d54e5b7daa63596d9f848adf684989c06c4f5a659f0c85a18f6f", size = 438764, upload-time = "2026-04-13T17:23:26.265Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/4f/87/46c0406d8b5ddd026f73adaf5ab75ce144219c41a4830b52df4b9ab55f7f/sentry_sdk-2.57.0.tar.gz", hash = "sha256:4be8d1e71c32fb27f79c577a337ac8912137bba4bcbc64a4ec1da4d6d8dc5199", size = 435288, upload-time = "2026-03-31T09:39:29.264Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/fa/eb/d875669993b762556ae8b2efd86219943b4c0864d22204d622a9aee3052b/sentry_sdk-2.58.0-py2.py3-none-any.whl", hash = "sha256:688d1c704ddecf382ea3326f21a67453d4caa95592d722b7c780a36a9d23109e", size = 460919, upload-time = "2026-04-13T17:23:24.675Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/64/982e07b93219cb52e1cca5d272cb579e2f3eb001956c9e7a9a6d106c9473/sentry_sdk-2.57.0-py2.py3-none-any.whl", hash = "sha256:812c8bf5ff3d2f0e89c82f5ce80ab3a6423e102729c4706af7413fd1eb480585", size = 456489, upload-time = "2026-03-31T09:39:27.524Z" },
 ]
 
 [[package]]
@@ -5991,6 +7265,26 @@ version = "1.3.7"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/8d/48/49393a96a2eef1ab418b17475fb92b8fcfad83d099e678751b05472e69de/setproctitle-1.3.7.tar.gz", hash = "sha256:bc2bc917691c1537d5b9bca1468437176809c7e11e5694ca79a9ca12345dcb9e", size = 27002, upload-time = "2025-09-05T12:51:25.278Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/48/fb401ec8c4953d519d05c87feca816ad668b8258448ff60579ac7a1c1386/setproctitle-1.3.7-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cf555b6299f10a6eb44e4f96d2f5a3884c70ce25dc5c8796aaa2f7b40e72cb1b", size = 18079, upload-time = "2025-09-05T12:49:07.732Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/a3/c2b0333c2716fb3b4c9a973dd113366ac51b4f8d56b500f4f8f704b4817a/setproctitle-1.3.7-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:690b4776f9c15aaf1023bb07d7c5b797681a17af98a4a69e76a1d504e41108b7", size = 13099, upload-time = "2025-09-05T12:49:09.222Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/f8/17bda581c517678260e6541b600eeb67745f53596dc077174141ba2f6702/setproctitle-1.3.7-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:00afa6fc507967d8c9d592a887cdc6c1f5742ceac6a4354d111ca0214847732c", size = 31793, upload-time = "2025-09-05T12:49:10.297Z" },
+    { url = "https://files.pythonhosted.org/packages/27/d1/76a33ae80d4e788ecab9eb9b53db03e81cfc95367ec7e3fbf4989962fedd/setproctitle-1.3.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9e02667f6b9fc1238ba753c0f4b0a37ae184ce8f3bbbc38e115d99646b3f4cd3", size = 32779, upload-time = "2025-09-05T12:49:12.157Z" },
+    { url = "https://files.pythonhosted.org/packages/59/27/1a07c38121967061564f5e0884414a5ab11a783260450172d4fc68c15621/setproctitle-1.3.7-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:83fcd271567d133eb9532d3b067c8a75be175b2b3b271e2812921a05303a693f", size = 34578, upload-time = "2025-09-05T12:49:13.393Z" },
+    { url = "https://files.pythonhosted.org/packages/d8/d4/725e6353935962d8bb12cbf7e7abba1d0d738c7f6935f90239d8e1ccf913/setproctitle-1.3.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:13fe37951dda1a45c35d77d06e3da5d90e4f875c4918a7312b3b4556cfa7ff64", size = 32030, upload-time = "2025-09-05T12:49:15.362Z" },
+    { url = "https://files.pythonhosted.org/packages/67/24/e4677ae8e1cb0d549ab558b12db10c175a889be0974c589c428fece5433e/setproctitle-1.3.7-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a05509cfb2059e5d2ddff701d38e474169e9ce2a298cf1b6fd5f3a213a553fe5", size = 33363, upload-time = "2025-09-05T12:49:16.829Z" },
+    { url = "https://files.pythonhosted.org/packages/55/d4/69ce66e4373a48fdbb37489f3ded476bb393e27f514968c3a69a67343ae0/setproctitle-1.3.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:6da835e76ae18574859224a75db6e15c4c2aaa66d300a57efeaa4c97ca4c7381", size = 31508, upload-time = "2025-09-05T12:49:18.032Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/5a/42c1ed0e9665d068146a68326529b5686a1881c8b9197c2664db4baf6aeb/setproctitle-1.3.7-cp310-cp310-win32.whl", hash = "sha256:9e803d1b1e20240a93bac0bc1025363f7f80cb7eab67dfe21efc0686cc59ad7c", size = 12558, upload-time = "2025-09-05T12:49:19.742Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/fe/dd206cc19a25561921456f6cb12b405635319299b6f366e0bebe872abc18/setproctitle-1.3.7-cp310-cp310-win_amd64.whl", hash = "sha256:a97200acc6b64ec4cada52c2ecaf1fba1ef9429ce9c542f8a7db5bcaa9dcbd95", size = 13245, upload-time = "2025-09-05T12:49:21.023Z" },
+    { url = "https://files.pythonhosted.org/packages/04/cd/1b7ba5cad635510720ce19d7122154df96a2387d2a74217be552887c93e5/setproctitle-1.3.7-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:a600eeb4145fb0ee6c287cb82a2884bd4ec5bbb076921e287039dcc7b7cc6dd0", size = 18085, upload-time = "2025-09-05T12:49:22.183Z" },
+    { url = "https://files.pythonhosted.org/packages/8f/1a/b2da0a620490aae355f9d72072ac13e901a9fec809a6a24fc6493a8f3c35/setproctitle-1.3.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:97a090fed480471bb175689859532709e28c085087e344bca45cf318034f70c4", size = 13097, upload-time = "2025-09-05T12:49:23.322Z" },
+    { url = "https://files.pythonhosted.org/packages/18/2e/bd03ff02432a181c1787f6fc2a678f53b7dacdd5ded69c318fe1619556e8/setproctitle-1.3.7-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1607b963e7b53e24ec8a2cb4e0ab3ae591d7c6bf0a160feef0551da63452b37f", size = 32191, upload-time = "2025-09-05T12:49:24.567Z" },
+    { url = "https://files.pythonhosted.org/packages/28/78/1e62fc0937a8549f2220445ed2175daacee9b6764c7963b16148119b016d/setproctitle-1.3.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a20fb1a3974e2dab857870cf874b325b8705605cb7e7e8bcbb915bca896f52a9", size = 33203, upload-time = "2025-09-05T12:49:25.871Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/3c/65edc65db3fa3df400cf13b05e9d41a3c77517b4839ce873aa6b4043184f/setproctitle-1.3.7-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f8d961bba676e07d77665204f36cffaa260f526e7b32d07ab3df6a2c1dfb44ba", size = 34963, upload-time = "2025-09-05T12:49:27.044Z" },
+    { url = "https://files.pythonhosted.org/packages/a1/32/89157e3de997973e306e44152522385f428e16f92f3cf113461489e1e2ee/setproctitle-1.3.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:db0fd964fbd3a9f8999b502f65bd2e20883fdb5b1fae3a424e66db9a793ed307", size = 32398, upload-time = "2025-09-05T12:49:28.909Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/18/77a765a339ddf046844cb4513353d8e9dcd8183da9cdba6e078713e6b0b2/setproctitle-1.3.7-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:db116850fcf7cca19492030f8d3b4b6e231278e8fe097a043957d22ce1bdf3ee", size = 33657, upload-time = "2025-09-05T12:49:30.323Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/63/f0b6205c64d74d2a24a58644a38ec77bdbaa6afc13747e75973bf8904932/setproctitle-1.3.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:316664d8b24a5c91ee244460bdaf7a74a707adaa9e14fbe0dc0a53168bb9aba1", size = 31836, upload-time = "2025-09-05T12:49:32.309Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/51/e1277f9ba302f1a250bbd3eedbbee747a244b3cc682eb58fb9733968f6d8/setproctitle-1.3.7-cp311-cp311-win32.whl", hash = "sha256:b74774ca471c86c09b9d5037c8451fff06bb82cd320d26ae5a01c758088c0d5d", size = 12556, upload-time = "2025-09-05T12:49:33.529Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/7b/822a23f17e9003dfdee92cd72758441ca2a3680388da813a371b716fb07f/setproctitle-1.3.7-cp311-cp311-win_amd64.whl", hash = "sha256:acb9097213a8dd3410ed9f0dc147840e45ca9797785272928d4be3f0e69e3be4", size = 13243, upload-time = "2025-09-05T12:49:34.553Z" },
     { url = "https://files.pythonhosted.org/packages/fb/f0/2dc88e842077719d7384d86cc47403e5102810492b33680e7dadcee64cd8/setproctitle-1.3.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2dc99aec591ab6126e636b11035a70991bc1ab7a261da428491a40b84376654e", size = 18049, upload-time = "2025-09-05T12:49:36.241Z" },
     { url = "https://files.pythonhosted.org/packages/f0/b4/50940504466689cda65680c9e9a1e518e5750c10490639fa687489ac7013/setproctitle-1.3.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:cdd8aa571b7aa39840fdbea620e308a19691ff595c3a10231e9ee830339dd798", size = 13079, upload-time = "2025-09-05T12:49:38.088Z" },
     { url = "https://files.pythonhosted.org/packages/d0/99/71630546b9395b095f4082be41165d1078204d1696c2d9baade3de3202d0/setproctitle-1.3.7-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2906b6c7959cdb75f46159bf0acd8cc9906cf1361c9e1ded0d065fe8f9039629", size = 32932, upload-time = "2025-09-05T12:49:39.271Z" },
@@ -6041,6 +7335,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/e7/e3/54b496ac724e60e61cc3447f02690105901ca6d90da0377dffe49ff99fc7/setproctitle-1.3.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:1fae595d032b30dab4d659bece20debd202229fce12b55abab978b7f30783d73", size = 33958, upload-time = "2025-09-05T12:50:39.841Z" },
     { url = "https://files.pythonhosted.org/packages/ea/a8/c84bb045ebf8c6fdc7f7532319e86f8380d14bbd3084e6348df56bdfe6fd/setproctitle-1.3.7-cp314-cp314t-win32.whl", hash = "sha256:02432f26f5d1329ab22279ff863c83589894977063f59e6c4b4845804a08f8c2", size = 12745, upload-time = "2025-09-05T12:50:41.377Z" },
     { url = "https://files.pythonhosted.org/packages/08/b6/3a5a4f9952972791a9114ac01dfc123f0df79903577a3e0a7a404a695586/setproctitle-1.3.7-cp314-cp314t-win_amd64.whl", hash = "sha256:cbc388e3d86da1f766d8fc2e12682e446064c01cea9f88a88647cfe7c011de6a", size = 13469, upload-time = "2025-09-05T12:50:42.67Z" },
+    { url = "https://files.pythonhosted.org/packages/34/8a/aff5506ce89bc3168cb492b18ba45573158d528184e8a9759a05a09088a9/setproctitle-1.3.7-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:eb440c5644a448e6203935ed60466ec8d0df7278cd22dc6cf782d07911bcbea6", size = 12654, upload-time = "2025-09-05T12:51:17.141Z" },
+    { url = "https://files.pythonhosted.org/packages/41/89/5b6f2faedd6ced3d3c085a5efbd91380fb1f61f4c12bc42acad37932f4e9/setproctitle-1.3.7-pp310-pypy310_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:502b902a0e4c69031b87870ff4986c290ebbb12d6038a70639f09c331b18efb2", size = 14284, upload-time = "2025-09-05T12:51:18.393Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/c0/4312fed3ca393a29589603fd48f17937b4ed0638b923bac75a728382e730/setproctitle-1.3.7-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f6f268caeabb37ccd824d749e7ce0ec6337c4ed954adba33ec0d90cc46b0ab78", size = 13282, upload-time = "2025-09-05T12:51:19.703Z" },
+    { url = "https://files.pythonhosted.org/packages/c3/5b/5e1c117ac84e3cefcf8d7a7f6b2461795a87e20869da065a5c087149060b/setproctitle-1.3.7-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:b1cac6a4b0252b8811d60b6d8d0f157c0fdfed379ac89c25a914e6346cf355a1", size = 12587, upload-time = "2025-09-05T12:51:21.195Z" },
+    { url = "https://files.pythonhosted.org/packages/73/02/b9eadc226195dcfa90eed37afe56b5dd6fa2f0e5220ab8b7867b8862b926/setproctitle-1.3.7-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f1704c9e041f2b1dc38f5be4552e141e1432fba3dd52c72eeffd5bc2db04dc65", size = 14286, upload-time = "2025-09-05T12:51:22.61Z" },
+    { url = "https://files.pythonhosted.org/packages/28/26/1be1d2a53c2a91ec48fa2ff4a409b395f836798adf194d99de9c059419ea/setproctitle-1.3.7-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:b08b61976ffa548bd5349ce54404bf6b2d51bd74d4f1b241ed1b0f25bce09c3a", size = 13282, upload-time = "2025-09-05T12:51:24.094Z" },
 ]
 
 [[package]]
@@ -6081,14 +7381,14 @@ wheels = [
 
 [[package]]
 name = "smart-open"
-version = "7.6.0"
+version = "7.5.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "wrapt" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/14/33/7a00ac9b4a63afb4279b99a766f6cbe56c443526dcbf5db97b219e21fde9/smart_open-7.6.0.tar.gz", hash = "sha256:44717f46b5ff276fac03b88e5d13d1c416f064f3b7b081381b0fa8889004bd7e", size = 54548, upload-time = "2026-04-13T09:48:04.347Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e8/be/a66598b305763861a9ab15ff0f2fbc44e47b1ce7a776797337a4eef37c66/smart_open-7.5.1.tar.gz", hash = "sha256:3f08e16827c4733699e6b2cc40328a3568f900cb12ad9a3ad233ba6c872d9fe7", size = 54034, upload-time = "2026-02-23T11:01:28.979Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/de/bc/2761410d0541e975f384bc89f062d716bf119499dd097eb1af33dcd3b1c0/smart_open-7.6.0-py3-none-any.whl", hash = "sha256:2a78f454610a826aa688065b54b4a0a9b12a5599fa61d5190e9bac2df5e5f53f", size = 64591, upload-time = "2026-04-13T09:48:02.687Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/ea/dcdecd68acebb49d3fd560473a43499b1635076f7f1ae8641c060fe7ce74/smart_open-7.5.1-py3-none-any.whl", hash = "sha256:3e07cbbd9c8a908bcb8e25d48becf1a5cbb4886fa975e9f34c672ed171df2318", size = 64108, upload-time = "2026-02-23T11:01:27.429Z" },
 ]
 
 [[package]]
@@ -6133,8 +7433,8 @@ version = "0.13.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cffi" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156, upload-time = "2025-01-25T09:17:04.831Z" }
 wheels = [
@@ -6158,44 +7458,257 @@ wheels = [
 
 [[package]]
 name = "sphinx"
-version = "9.1.0"
+version = "8.1.3"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
 dependencies = [
-    { name = "alabaster" },
-    { name = "babel" },
-    { name = "colorama", marker = "sys_platform == 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "docutils" },
-    { name = "imagesize" },
-    { name = "jinja2" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
-    { name = "pygments" },
-    { name = "requests" },
-    { name = "roman-numerals" },
-    { name = "snowballstemmer" },
-    { name = "sphinxcontrib-applehelp" },
-    { name = "sphinxcontrib-devhelp" },
-    { name = "sphinxcontrib-htmlhelp" },
-    { name = "sphinxcontrib-jsmath" },
-    { name = "sphinxcontrib-qthelp" },
-    { name = "sphinxcontrib-serializinghtml" },
+    { name = "alabaster", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "babel", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "colorama", marker = "(python_full_version < '3.11' and sys_platform == 'win32') or (python_full_version >= '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "docutils", version = "0.21.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "imagesize", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pygments", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "requests", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "snowballstemmer", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-applehelp", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-devhelp", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-htmlhelp", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-jsmath", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-qthelp", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-serializinghtml", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/6f/6d/be0b61178fe2cdcb67e2a92fc9ebb488e3c51c4f74a36a7824c0adf23425/sphinx-8.1.3.tar.gz", hash = "sha256:43c1911eecb0d3e161ad78611bc905d1ad0e523e4ddc202a58a821773dc4c927", size = 8184611, upload-time = "2024-10-13T20:27:13.93Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/26/60/1ddff83a56d33aaf6f10ec8ce84b4c007d9368b21008876fceda7e7381ef/sphinx-8.1.3-py3-none-any.whl", hash = "sha256:09719015511837b76bf6e03e42eb7595ac8c2e41eeb9c29c5b755c6b677992a2", size = 3487125, upload-time = "2024-10-13T20:27:10.448Z" },
+]
+
+[[package]]
+name = "sphinx"
+version = "9.0.4"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+dependencies = [
+    { name = "alabaster", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "babel", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "colorama", marker = "(python_full_version == '3.11.*' and sys_platform == 'win32') or (python_full_version != '3.11.*' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "imagesize", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pygments", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "requests", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "roman-numerals", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "snowballstemmer", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-applehelp", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-devhelp", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-htmlhelp", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-jsmath", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-qthelp", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-serializinghtml", marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/42/50/a8c6ccc36d5eacdfd7913ddccd15a9cee03ecafc5ee2bc40e1f168d85022/sphinx-9.0.4.tar.gz", hash = "sha256:594ef59d042972abbc581d8baa577404abe4e6c3b04ef61bd7fc2acbd51f3fa3", size = 8710502, upload-time = "2025-12-04T07:45:27.343Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c6/3f/4bbd76424c393caead2e1eb89777f575dee5c8653e2d4b6afd7a564f5974/sphinx-9.0.4-py3-none-any.whl", hash = "sha256:5bebc595a5e943ea248b99c13814c1c5e10b3ece718976824ffa7959ff95fffb", size = 3917713, upload-time = "2025-12-04T07:45:24.944Z" },
+]
+
+[[package]]
+name = "sphinx"
+version = "9.1.0"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+dependencies = [
+    { name = "alabaster", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "babel", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "colorama", marker = "(python_full_version >= '3.12' and sys_platform == 'win32') or (python_full_version < '3.12' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "docutils", version = "0.22.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "imagesize", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "jinja2", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "pygments", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "requests", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "roman-numerals", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "snowballstemmer", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-applehelp", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-devhelp", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-htmlhelp", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-jsmath", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-qthelp", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinxcontrib-serializinghtml", marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/cd/bd/f08eb0f4eed5c83f1ba2a3bd18f7745a2b1525fad70660a1c00224ec468a/sphinx-9.1.0.tar.gz", hash = "sha256:7741722357dd75f8190766926071fed3bdc211c74dd2d7d4df5404da95930ddb", size = 8718324, upload-time = "2025-12-31T15:09:27.646Z" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/73/f7/b1884cb3188ab181fc81fa00c266699dab600f927a964df02ec3d5d1916a/sphinx-9.1.0-py3-none-any.whl", hash = "sha256:c84fdd4e782504495fe4f2c0b3413d6c2bf388589bb352d439b2a3bb99991978", size = 3921742, upload-time = "2025-12-31T15:09:25.561Z" },
 ]
 
+[[package]]
+name = "sphinx-autobuild"
+version = "2024.10.3"
+source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine != 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and platform_machine == 's390x' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version < '3.11' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
+dependencies = [
+    { name = "colorama", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "starlette", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "uvicorn", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "watchfiles", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "websockets", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/a5/2c/155e1de2c1ba96a72e5dba152c509a8b41e047ee5c2def9e9f0d812f8be7/sphinx_autobuild-2024.10.3.tar.gz", hash = "sha256:248150f8f333e825107b6d4b86113ab28fa51750e5f9ae63b59dc339be951fb1", size = 14023, upload-time = "2024-10-02T23:15:30.172Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/18/c0/eba125db38c84d3c74717008fd3cb5000b68cd7e2cbafd1349c6a38c3d3b/sphinx_autobuild-2024.10.3-py3-none-any.whl", hash = "sha256:158e16c36f9d633e613c9aaf81c19b0fc458ca78b112533b20dafcda430d60fa", size = 11908, upload-time = "2024-10-02T23:15:28.739Z" },
+]
+
 [[package]]
 name = "sphinx-autobuild"
 version = "2025.8.25"
 source = { registry = "https://pypi.org/simple" }
+resolution-markers = [
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version >= '3.14' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.13.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.12.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform == 'emscripten' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+    "python_full_version == '3.11.*' and sys_platform != 'emscripten' and sys_platform != 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts'",
+]
 dependencies = [
-    { name = "colorama" },
-    { name = "sphinx" },
-    { name = "starlette" },
-    { name = "uvicorn" },
-    { name = "watchfiles" },
-    { name = "websockets" },
+    { name = "colorama", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "starlette", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "uvicorn", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "watchfiles", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "websockets", marker = "python_full_version >= '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/e0/3c/a59a3a453d4133777f7ed2e83c80b7dc817d43c74b74298ca0af869662ad/sphinx_autobuild-2025.8.25.tar.gz", hash = "sha256:9cf5aab32853c8c31af572e4fecdc09c997e2b8be5a07daf2a389e270e85b213", size = 15200, upload-time = "2025-08-25T18:44:55.436Z" }
 wheels = [
@@ -6208,6 +7721,7 @@ version = "0.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "astroid" },
+    { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "typing-extensions" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/17/5f/5350046d1aa1a56b063ae08b9ad871025335c9d55fe2372896ea48711da9/sphinx_autodoc2-0.5.0.tar.gz", hash = "sha256:7d76044aa81d6af74447080182b6868c7eb066874edc835e8ddf810735b6565a", size = 115077, upload-time = "2023-11-27T07:27:51.407Z" }
@@ -6220,7 +7734,9 @@ name = "sphinx-copybutton"
 version = "0.5.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "sphinx" },
+    { name = "sphinx", version = "8.1.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.0.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version == '3.11.*' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "sphinx", version = "9.1.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.12' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/fc/2b/a964715e7f5295f77509e59309959f4125122d648f86b4fe7d70ca1d882c/sphinx-copybutton-0.5.2.tar.gz", hash = "sha256:4cf17c82fb9646d1bc9ca92ac280813a3b605d8c421225fd9913154103ee1fbd", size = 23039, upload-time = "2023-04-14T08:10:22.998Z" }
 wheels = [
@@ -6291,6 +7807,20 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/09/45/461788f35e0364a8da7bda51a1fe1b09762d0c32f12f63727998d85a873b/sqlalchemy-2.0.49.tar.gz", hash = "sha256:d15950a57a210e36dd4cec1aac22787e2a4d57ba9318233e2ef8b2daf9ff2d5f", size = 9898221, upload-time = "2026-04-03T16:38:11.704Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/96/76/f908955139842c362aa877848f42f9249642d5b69e06cee9eae5111da1bd/sqlalchemy-2.0.49-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:42e8804962f9e6f4be2cbaedc0c3718f08f60a16910fa3d86da5a1e3b1bfe60f", size = 2159321, upload-time = "2026-04-03T16:50:11.8Z" },
+    { url = "https://files.pythonhosted.org/packages/24/e2/17ba0b7bfbd8de67196889b6d951de269e8a46057d92baca162889beb16d/sqlalchemy-2.0.49-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cc992c6ed024c8c3c592c5fc9846a03dd68a425674900c70122c77ea16c5fb0b", size = 3238937, upload-time = "2026-04-03T16:54:45.731Z" },
+    { url = "https://files.pythonhosted.org/packages/90/1e/410dd499c039deacff395eec01a9da057125fcd0c97e3badc252c6a2d6a7/sqlalchemy-2.0.49-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6eb188b84269f357669b62cb576b5b918de10fb7c728a005fa0ebb0b758adce1", size = 3237188, upload-time = "2026-04-03T16:56:53.217Z" },
+    { url = "https://files.pythonhosted.org/packages/ab/06/e797a8b98a3993ac4bc785309b9b6d005457fc70238ee6cefa7c8867a92e/sqlalchemy-2.0.49-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:62557958002b69699bdb7f5137c6714ca1133f045f97b3903964f47db97ea339", size = 3190061, upload-time = "2026-04-03T16:54:47.489Z" },
+    { url = "https://files.pythonhosted.org/packages/44/d3/5a9f7ef580af1031184b38235da6ac58c3b571df01c9ec061c44b2b0c5a6/sqlalchemy-2.0.49-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:da9b91bca419dc9b9267ffadde24eae9b1a6bffcd09d0a207e5e3af99a03ce0d", size = 3211477, upload-time = "2026-04-03T16:56:55.056Z" },
+    { url = "https://files.pythonhosted.org/packages/69/ec/7be8c8cb35f038e963a203e4fe5a028989167cc7299927b7cf297c271e37/sqlalchemy-2.0.49-cp310-cp310-win32.whl", hash = "sha256:5e61abbec255be7b122aa461021daa7c3f310f3e743411a67079f9b3cc91ece3", size = 2119965, upload-time = "2026-04-03T17:00:50.009Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/31/0defb93e3a10b0cf7d1271aedd87251a08c3a597ee4f353281769b547b5a/sqlalchemy-2.0.49-cp310-cp310-win_amd64.whl", hash = "sha256:0c98c59075b890df8abfcc6ad632879540f5791c68baebacb4f833713b510e75", size = 2142935, upload-time = "2026-04-03T17:00:51.675Z" },
+    { url = "https://files.pythonhosted.org/packages/60/b5/e3617cc67420f8f403efebd7b043128f94775e57e5b84e7255203390ceae/sqlalchemy-2.0.49-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c5070135e1b7409c4161133aa525419b0062088ed77c92b1da95366ec5cbebbe", size = 2159126, upload-time = "2026-04-03T16:50:13.242Z" },
+    { url = "https://files.pythonhosted.org/packages/20/9b/91ca80403b17cd389622a642699e5f6564096b698e7cdcbcbb6409898bc4/sqlalchemy-2.0.49-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9ac7a3e245fd0310fd31495eb61af772e637bdf7d88ee81e7f10a3f271bff014", size = 3315509, upload-time = "2026-04-03T16:54:49.332Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/61/0722511d98c54de95acb327824cb759e8653789af2b1944ab1cc69d32565/sqlalchemy-2.0.49-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d4e5a0ceba319942fa6b585cf82539288a61e314ef006c1209f734551ab9536", size = 3315014, upload-time = "2026-04-03T16:56:56.376Z" },
+    { url = "https://files.pythonhosted.org/packages/46/55/d514a653ffeb4cebf4b54c47bec32ee28ad89d39fafba16eeed1d81dccd5/sqlalchemy-2.0.49-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3ddcb27fb39171de36e207600116ac9dfd4ae46f86c82a9bf3934043e80ebb88", size = 3267388, upload-time = "2026-04-03T16:54:51.272Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/16/0dcc56cb6d3335c1671a2258f5d2cb8267c9a2260e27fde53cbfb1b3540a/sqlalchemy-2.0.49-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:32fe6a41ad97302db2931f05bb91abbcc65b5ce4c675cd44b972428dd2947700", size = 3289602, upload-time = "2026-04-03T16:56:57.63Z" },
+    { url = "https://files.pythonhosted.org/packages/51/6c/f8ab6fb04470a133cd80608db40aa292e6bae5f162c3a3d4ab19544a67af/sqlalchemy-2.0.49-cp311-cp311-win32.whl", hash = "sha256:46d51518d53edfbe0563662c96954dc8fcace9832332b914375f45a99b77cc9a", size = 2119044, upload-time = "2026-04-03T17:00:53.455Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/59/55a6d627d04b6ebb290693681d7683c7da001eddf90b60cfcc41ee907978/sqlalchemy-2.0.49-cp311-cp311-win_amd64.whl", hash = "sha256:951d4a210744813be63019f3df343bf233b7432aadf0db54c75802247330d3af", size = 2143642, upload-time = "2026-04-03T17:00:54.769Z" },
     { url = "https://files.pythonhosted.org/packages/49/b3/2de412451330756aaaa72d27131db6dde23995efe62c941184e15242a5fa/sqlalchemy-2.0.49-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4bbccb45260e4ff1b7db0be80a9025bb1e6698bdb808b83fff0000f7a90b2c0b", size = 2157681, upload-time = "2026-04-03T16:53:07.132Z" },
     { url = "https://files.pythonhosted.org/packages/50/84/b2a56e2105bd11ebf9f0b93abddd748e1a78d592819099359aa98134a8bf/sqlalchemy-2.0.49-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:fb37f15714ec2652d574f021d479e78cd4eb9d04396dca36568fdfffb3487982", size = 3338976, upload-time = "2026-04-03T17:07:40Z" },
     { url = "https://files.pythonhosted.org/packages/2c/fa/65fcae2ed62f84ab72cf89536c7c3217a156e71a2c111b1305ab6f0690e2/sqlalchemy-2.0.49-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3bb9ec6436a820a4c006aad1ac351f12de2f2dbdaad171692ee457a02429b672", size = 3351937, upload-time = "2026-04-03T17:12:23.374Z" },
@@ -6374,6 +7904,19 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/99/55/db07de81b5c630da5cbf5c7df646580ca26dfaefa593667fc6f2fe016d2e/tabulate-0.10.0-py3-none-any.whl", hash = "sha256:f0b0622e567335c8fabaaa659f1b33bcb6ddfe2e496071b743aa113f8774f2d3", size = 39814, upload-time = "2026-03-04T18:55:31.284Z" },
 ]
 
+[[package]]
+name = "taskgroup"
+version = "0.2.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "exceptiongroup", marker = "python_full_version < '3.11'" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f0/8d/e218e0160cc1b692e6e0e5ba34e8865dbb171efeb5fc9a704544b3020605/taskgroup-0.2.2.tar.gz", hash = "sha256:078483ac3e78f2e3f973e2edbf6941374fbea81b9c5d0a96f51d297717f4752d", size = 11504, upload-time = "2025-01-03T09:24:13.761Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d1/b1/74babcc824a57904e919f3af16d86c08b524c0691504baf038ef2d7f655c/taskgroup-0.2.2-py2.py3-none-any.whl", hash = "sha256:e2c53121609f4ae97303e9ea1524304b4de6faf9eb2c9280c7f87976479a52fb", size = 14237, upload-time = "2025-01-03T09:24:11.41Z" },
+]
+
 [[package]]
 name = "tenacity"
 version = "9.1.4"
@@ -6391,10 +7934,9 @@ dependencies = [
     { name = "absl-py" },
     { name = "grpcio" },
     { name = "markdown" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging" },
     { name = "pillow" },
     { name = "protobuf" },
     { name = "setuptools" },
@@ -6420,6 +7962,8 @@ name = "tensorstore"
 version = "0.1.74"
 source = { registry = "https://pypi.org/simple" }
 resolution-markers = [
+    "python_full_version < '3.11' and platform_machine != 's390x'",
+    "python_full_version < '3.11' and platform_machine == 's390x'",
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version >= '3.14' and platform_machine != 's390x' and sys_platform == 'emscripten'",
@@ -6434,11 +7978,22 @@ resolution-markers = [
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
 ]
 dependencies = [
-    { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.13'" },
+    { name = "ml-dtypes", version = "0.4.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "ml-dtypes", version = "0.5.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3c/b9/ea25aba62c688a87d7d7d9cc5926d602e2f9e84fa72586825486fb180b7e/tensorstore-0.1.74.tar.gz", hash = "sha256:a062875f27283d30ce4959c408c253ecb336fce8e3f9837c064e3d30cda79203", size = 6795605, upload-time = "2025-04-24T15:42:18.829Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/f2/20/1e7e776dc30f2f07416223c12f9ad244ec539af5fa1fbef9320812a9a3b6/tensorstore-0.1.74-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:edfae80aceb05640ac2209a11a4b76cecd5d9c4a95c01ede8c89c8edaa90f9d5", size = 15292660, upload-time = "2025-04-24T15:41:18.253Z" },
+    { url = "https://files.pythonhosted.org/packages/76/cc/81bf2d6a4caa239d38905b439864d3a8bf06b27d6d31bb2396e3f4f5cc55/tensorstore-0.1.74-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ab985d767d53e9478987c23dc7aea8f7e8aed2ef90ec8f7f939e8b399667feb1", size = 13260438, upload-time = "2025-04-24T15:41:22.596Z" },
+    { url = "https://files.pythonhosted.org/packages/88/4c/a26c4c8b8e7573d2b552505cd46a658b9a68a80d88e9d3c68f16d10e4d62/tensorstore-0.1.74-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d16d1181c292ea065ebd203e823420c65e365d0407eea8f0a3dd82995da0cc65", size = 17041531, upload-time = "2025-04-24T15:41:25.492Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/a9/3859b1b497dacf2093e196e1d4ed3b95e8553c7d7c9fe1f88216c72253a9/tensorstore-0.1.74-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f327e813152705b5297f251824a91106e17a06fd2f6b5f6e94c6401c5937da8c", size = 18392852, upload-time = "2025-04-24T15:41:28.136Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/3b/b7494ea0a37dd4cd3721f104fc52d4c953354b801eb1adf08e40bc08aaa0/tensorstore-0.1.74-cp310-cp310-win_amd64.whl", hash = "sha256:e56e9690cc20463951a52a6908e18056a93ce5bcd4a881834e2b5962801a1125", size = 12429998, upload-time = "2025-04-24T15:41:30.794Z" },
+    { url = "https://files.pythonhosted.org/packages/0d/3e/d67bb3d9bb7409469d15fb90ef5756e6ac8b835af7f27c02fc542c4b4059/tensorstore-0.1.74-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:8353e619d9140ca50fc0cb5b846e07c68462dd5015b4714752a0a664e48a03d3", size = 15294582, upload-time = "2025-04-24T15:41:33.794Z" },
+    { url = "https://files.pythonhosted.org/packages/01/f4/49cb5ea8e63303fcb0a6ebf0ed546aaec63982a4abca0e9801da5e3a24e3/tensorstore-0.1.74-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3ad1bfbb257ab84de1a5c9b79a60cebb5fbb7a411ddb1c246c21c9795789ba1", size = 13261395, upload-time = "2025-04-24T15:41:36.372Z" },
+    { url = "https://files.pythonhosted.org/packages/ad/7b/9c12d4687e6ff19222f12719286c13a546f1714e5dbed75d52a4267534ed/tensorstore-0.1.74-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f3ad9daf4c757db41ad091a1a5502807baeb848be0937986d8766049c39c8466", size = 17042621, upload-time = "2025-04-24T15:41:39.284Z" },
+    { url = "https://files.pythonhosted.org/packages/b5/07/cf0dc4540a78bc715fbcf4417c5dc708f3d12ed1664bf117f22463f411fc/tensorstore-0.1.74-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0a35364804e7d71bf5e86d2dae4de04c90249b61ff71448b9713b4e72b2389bd", size = 18393581, upload-time = "2025-04-24T15:41:42.554Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/42/edf004c5a101e021f052ea3564250d773d7cf6458f92934456ffa967383f/tensorstore-0.1.74-cp311-cp311-win_amd64.whl", hash = "sha256:15dcb6ce282e32d005caad34d595b0be070947578448a2861c63fdd608fc7394", size = 12431849, upload-time = "2025-04-24T15:41:45.263Z" },
     { url = "https://files.pythonhosted.org/packages/a1/14/2e6d1cad744af9e9a1a78d881a908a859ad95b61b15de10397069f55fbd8/tensorstore-0.1.74-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:7218722ee5d74e4d01f357917d3b1b7b1d6b1c068aa73e3d801cb3d58fc45116", size = 15334307, upload-time = "2025-04-24T15:41:48.315Z" },
     { url = "https://files.pythonhosted.org/packages/b2/ac/8d572b8c6d689eb50db0252e9d35ee6278a6aed481b64d7e025cf51e32c4/tensorstore-0.1.74-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a6926554a8633d0210bdba619d3996fff6a6af4214237fbca626e6ddfcc8ea39", size = 13288669, upload-time = "2025-04-24T15:41:50.808Z" },
     { url = "https://files.pythonhosted.org/packages/9d/6c/3e76d614ad70b61670686d91abaa3ddee6b01255bf2b40f050beb15b7970/tensorstore-0.1.74-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d584e468eb4ef8195f5d21a9da4780cf96c6074b87ef219b43a89efce3d503ca", size = 17031720, upload-time = "2025-04-24T15:41:55.092Z" },
@@ -6464,24 +8019,35 @@ resolution-markers = [
     "python_full_version >= '3.14' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'win32'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform == 'emscripten'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
     "python_full_version == '3.13.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
     "python_full_version == '3.13.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
-    "python_full_version < '3.13' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.12.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'win32'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform == 'emscripten'",
+    "python_full_version == '3.11.*' and platform_machine != 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "python_full_version == '3.11.*' and platform_machine == 's390x' and sys_platform != 'emscripten' and sys_platform != 'win32'",
 ]
 dependencies = [
-    { name = "ml-dtypes", version = "0.5.4", source = { registry = "https://pypi.org/simple" } },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "ml-dtypes", version = "0.5.4", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/cd/9b/43aedb544937f214dd7c665a7edf1b8b74f2f55d53ebd351c0ce69acf81a/tensorstore-0.1.82.tar.gz", hash = "sha256:ccfceffb7611fc61330f6da24b8b0abd9251d480ac8a5bac5a1729f9ed0c3a9f", size = 7160364, upload-time = "2026-03-13T00:22:16.888Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/5b/d2/66513f1782dc52425bda0d5f7baae94ea639bbd226650ecb000223cc9359/tensorstore-0.1.82-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:6ae87ae9baf7593b5c8d09dbdf3ee6969068833a6fd85317b781a4cf7cb7e533", size = 16555813, upload-time = "2026-03-13T00:21:24.802Z" },
+    { url = "https://files.pythonhosted.org/packages/04/4f/66a8af7dd6f5d8dabebe6edcdf0b87a06ac1f92318d972e9e6f5d3754b5d/tensorstore-0.1.82-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2471638a184473e384a6c3ffd98453b670a78372f2d3ed9707f27aebe5482c47", size = 14899141, upload-time = "2026-03-13T00:21:27.591Z" },
+    { url = "https://files.pythonhosted.org/packages/36/50/7a9840eb6c9ec52348dcadf8ef2dca7b2cb7d3ae25bafb672a236fd885f4/tensorstore-0.1.82-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:38eed3828101622552e63564d7a3a10b0cecb05f61d40e0f236b95f622a60897", size = 19339518, upload-time = "2026-03-13T00:21:29.885Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/5f/85b42d1173b0ebbd1c11879f8ff60a72d7f5bbc111255d2c685a33813f2a/tensorstore-0.1.82-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:aed5a6fc605e711c8a8dbd8ae73b919b8c6ca04ae94b0e0f6489fc54cdcab245", size = 20947623, upload-time = "2026-03-13T00:21:32.084Z" },
+    { url = "https://files.pythonhosted.org/packages/11/23/dcbd9ab116d58d3a1ed9686102592c032b7ffd558aa8626fff1c18701ccd/tensorstore-0.1.82-cp311-cp311-win_amd64.whl", hash = "sha256:afb825258329241341aa3e64293b64562df7812a02d5f6c6e4c9f731d0e34b0e", size = 13387579, upload-time = "2026-03-13T00:21:34.393Z" },
     { url = "https://files.pythonhosted.org/packages/0d/c3/5ab0b99487b2596bdc0ebd3a569e50415949a63bad90b18e6476de91a7bb/tensorstore-0.1.82-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:f0ac091bd47ea6f051fe11230ad2642c254b46a8fabdd5184b0600556b5529ed", size = 16570668, upload-time = "2026-03-13T00:21:36.386Z" },
     { url = "https://files.pythonhosted.org/packages/aa/95/92b00a4b2e6192528a9c5bac9f53007acf4aa5d54943b9e114bedb72b2da/tensorstore-0.1.82-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8cae7d0c9b2fa0653f90b147daaf9ed04664cab7d297b9772efcfa088da26cab", size = 14904517, upload-time = "2026-03-13T00:21:38.464Z" },
     { url = "https://files.pythonhosted.org/packages/46/7e/c9c8ad65ee4015787e32d31bcf8278fcb27109e809f8334a64285bd73028/tensorstore-0.1.82-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:34c491ea3c6c1904d4618bfe40020bd83aaeb19d52a266ea0f6919eb3fdc64c4", size = 19344428, upload-time = "2026-03-13T00:21:40.575Z" },
@@ -6551,6 +8117,20 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/89/b3/2cb7c17b6c4cf8ca983204255d3f1d95eda7213e247e6947a0ee2c747a2c/tiktoken-0.12.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3de02f5a491cfd179aec916eddb70331814bd6bf764075d39e21d5862e533970", size = 1051991, upload-time = "2025-10-06T20:21:34.098Z" },
+    { url = "https://files.pythonhosted.org/packages/27/0f/df139f1df5f6167194ee5ab24634582ba9a1b62c6b996472b0277ec80f66/tiktoken-0.12.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b6cfb6d9b7b54d20af21a912bfe63a2727d9cfa8fbda642fd8322c70340aad16", size = 995798, upload-time = "2025-10-06T20:21:35.579Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/5d/26a691f28ab220d5edc09b9b787399b130f24327ef824de15e5d85ef21aa/tiktoken-0.12.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:cde24cdb1b8a08368f709124f15b36ab5524aac5fa830cc3fdce9c03d4fb8030", size = 1129865, upload-time = "2025-10-06T20:21:36.675Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/94/443fab3d4e5ebecac895712abd3849b8da93b7b7dec61c7db5c9c7ebe40c/tiktoken-0.12.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:6de0da39f605992649b9cfa6f84071e3f9ef2cec458d08c5feb1b6f0ff62e134", size = 1152856, upload-time = "2025-10-06T20:21:37.873Z" },
+    { url = "https://files.pythonhosted.org/packages/54/35/388f941251b2521c70dd4c5958e598ea6d2c88e28445d2fb8189eecc1dfc/tiktoken-0.12.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:6faa0534e0eefbcafaccb75927a4a380463a2eaa7e26000f0173b920e98b720a", size = 1195308, upload-time = "2025-10-06T20:21:39.577Z" },
+    { url = "https://files.pythonhosted.org/packages/f8/00/c6681c7f833dd410576183715a530437a9873fa910265817081f65f9105f/tiktoken-0.12.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:82991e04fc860afb933efb63957affc7ad54f83e2216fe7d319007dab1ba5892", size = 1255697, upload-time = "2025-10-06T20:21:41.154Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/d2/82e795a6a9bafa034bf26a58e68fe9a89eeaaa610d51dbeb22106ba04f0a/tiktoken-0.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:6fb2995b487c2e31acf0a9e17647e3b242235a20832642bb7a9d1a181c0c1bb1", size = 879375, upload-time = "2025-10-06T20:21:43.201Z" },
+    { url = "https://files.pythonhosted.org/packages/de/46/21ea696b21f1d6d1efec8639c204bdf20fde8bafb351e1355c72c5d7de52/tiktoken-0.12.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6e227c7f96925003487c33b1b32265fad2fbcec2b7cf4817afb76d416f40f6bb", size = 1051565, upload-time = "2025-10-06T20:21:44.566Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/d9/35c5d2d9e22bb2a5f74ba48266fb56c63d76ae6f66e02feb628671c0283e/tiktoken-0.12.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:c06cf0fcc24c2cb2adb5e185c7082a82cba29c17575e828518c2f11a01f445aa", size = 995284, upload-time = "2025-10-06T20:21:45.622Z" },
+    { url = "https://files.pythonhosted.org/packages/01/84/961106c37b8e49b9fdcf33fe007bb3a8fdcc380c528b20cc7fbba80578b8/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:f18f249b041851954217e9fd8e5c00b024ab2315ffda5ed77665a05fa91f42dc", size = 1129201, upload-time = "2025-10-06T20:21:47.074Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/d0/3d9275198e067f8b65076a68894bb52fd253875f3644f0a321a720277b8a/tiktoken-0.12.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:47a5bc270b8c3db00bb46ece01ef34ad050e364b51d406b6f9730b64ac28eded", size = 1152444, upload-time = "2025-10-06T20:21:48.139Z" },
+    { url = "https://files.pythonhosted.org/packages/78/db/a58e09687c1698a7c592e1038e01c206569b86a0377828d51635561f8ebf/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:508fa71810c0efdcd1b898fda574889ee62852989f7c1667414736bcb2b9a4bd", size = 1195080, upload-time = "2025-10-06T20:21:49.246Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/1b/a9e4d2bf91d515c0f74afc526fd773a812232dd6cda33ebea7f531202325/tiktoken-0.12.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a1af81a6c44f008cba48494089dd98cccb8b313f55e961a52f5b222d1e507967", size = 1255240, upload-time = "2025-10-06T20:21:50.274Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/15/963819345f1b1fb0809070a79e9dd96938d4ca41297367d471733e79c76c/tiktoken-0.12.0-cp311-cp311-win_amd64.whl", hash = "sha256:3e68e3e593637b53e56f7237be560f7a394451cb8c11079755e80ae64b9e6def", size = 879422, upload-time = "2025-10-06T20:21:51.734Z" },
     { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728, upload-time = "2025-10-06T20:21:52.756Z" },
     { url = "https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049, upload-time = "2025-10-06T20:21:53.782Z" },
     { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008, upload-time = "2025-10-06T20:21:54.832Z" },
@@ -6612,6 +8192,10 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fd/18/a545c4ea42af3df6effd7d13d250ba77a0a86fb20393143bbb9a92e434d4/tokenizers-0.22.2-cp39-abi3-win32.whl", hash = "sha256:a6bf3f88c554a2b653af81f3204491c818ae2ac6fbc09e76ef4773351292bc92", size = 2502363, upload-time = "2026-01-05T10:45:20.593Z" },
     { url = "https://files.pythonhosted.org/packages/65/71/0670843133a43d43070abeb1949abfdef12a86d490bea9cd9e18e37c5ff7/tokenizers-0.22.2-cp39-abi3-win_amd64.whl", hash = "sha256:c9ea31edff2968b44a88f97d784c2f16dc0729b8b143ed004699ebca91f05c48", size = 2747786, upload-time = "2026-01-05T10:45:18.411Z" },
     { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
+    { url = "https://files.pythonhosted.org/packages/84/04/655b79dbcc9b3ac5f1479f18e931a344af67e5b7d3b251d2dcdcd7558592/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:753d47ebd4542742ef9261d9da92cd545b2cacbb48349a1225466745bb866ec4", size = 3282301, upload-time = "2026-01-05T10:40:34.858Z" },
+    { url = "https://files.pythonhosted.org/packages/46/cd/e4851401f3d8f6f45d8480262ab6a5c8cb9c4302a790a35aa14eeed6d2fd/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e10bf9113d209be7cd046d40fbabbaf3278ff6d18eb4da4c500443185dc1896c", size = 3161308, upload-time = "2026-01-05T10:40:40.737Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/6e/55553992a89982cd12d4a66dddb5e02126c58677ea3931efcbe601d419db/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:64d94e84f6660764e64e7e0b22baa72f6cd942279fdbb21d46abd70d179f0195", size = 3718964, upload-time = "2026-01-05T10:40:46.56Z" },
+    { url = "https://files.pythonhosted.org/packages/59/8c/b1c87148aa15e099243ec9f0cf9d0e970cc2234c3257d558c25a2c5304e6/tokenizers-0.22.2-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f01a9c019878532f98927d2bacb79bbb404b43d3437455522a00a30718cdedb5", size = 3373542, upload-time = "2026-01-05T10:40:52.803Z" },
 ]
 
 [[package]]
@@ -6623,6 +8207,60 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/44/6f/7120676b6d73228c96e17f1f794d8ab046fc910d781c8d151120c3f1569e/toml-0.10.2-py2.py3-none-any.whl", hash = "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", size = 16588, upload-time = "2020-11-01T01:40:20.672Z" },
 ]
 
+[[package]]
+name = "tomli"
+version = "2.4.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/22/de/48c59722572767841493b26183a0d1cc411d54fd759c5607c4590b6563a6/tomli-2.4.1.tar.gz", hash = "sha256:7c7e1a961a0b2f2472c1ac5b69affa0ae1132c39adcb67aba98568702b9cc23f", size = 17543, upload-time = "2026-03-25T20:22:03.828Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/f4/11/db3d5885d8528263d8adc260bb2d28ebf1270b96e98f0e0268d32b8d9900/tomli-2.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f8f0fc26ec2cc2b965b7a3b87cd19c5c6b8c5e5f436b984e85f486d652285c30", size = 154704, upload-time = "2026-03-25T20:21:10.473Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/f7/675db52c7e46064a9aa928885a9b20f4124ecb9bc2e1ce74c9106648d202/tomli-2.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4ab97e64ccda8756376892c53a72bd1f964e519c77236368527f758fbc36a53a", size = 149454, upload-time = "2026-03-25T20:21:12.036Z" },
+    { url = "https://files.pythonhosted.org/packages/61/71/81c50943cf953efa35bce7646caab3cf457a7d8c030b27cfb40d7235f9ee/tomli-2.4.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:96481a5786729fd470164b47cdb3e0e58062a496f455ee41b4403be77cb5a076", size = 237561, upload-time = "2026-03-25T20:21:13.098Z" },
+    { url = "https://files.pythonhosted.org/packages/48/c1/f41d9cb618acccca7df82aaf682f9b49013c9397212cb9f53219e3abac37/tomli-2.4.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5a881ab208c0baf688221f8cecc5401bd291d67e38a1ac884d6736cbcd8247e9", size = 243824, upload-time = "2026-03-25T20:21:14.569Z" },
+    { url = "https://files.pythonhosted.org/packages/22/e4/5a816ecdd1f8ca51fb756ef684b90f2780afc52fc67f987e3c61d800a46d/tomli-2.4.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47149d5bd38761ac8be13a84864bf0b7b70bc051806bc3669ab1cbc56216b23c", size = 242227, upload-time = "2026-03-25T20:21:15.712Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/49/2b2a0ef529aa6eec245d25f0c703e020a73955ad7edf73e7f54ddc608aa5/tomli-2.4.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ec9bfaf3ad2df51ace80688143a6a4ebc09a248f6ff781a9945e51937008fcbc", size = 247859, upload-time = "2026-03-25T20:21:17.001Z" },
+    { url = "https://files.pythonhosted.org/packages/83/bd/6c1a630eaca337e1e78c5903104f831bda934c426f9231429396ce3c3467/tomli-2.4.1-cp311-cp311-win32.whl", hash = "sha256:ff2983983d34813c1aeb0fa89091e76c3a22889ee83ab27c5eeb45100560c049", size = 97204, upload-time = "2026-03-25T20:21:18.079Z" },
+    { url = "https://files.pythonhosted.org/packages/42/59/71461df1a885647e10b6bb7802d0b8e66480c61f3f43079e0dcd315b3954/tomli-2.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:5ee18d9ebdb417e384b58fe414e8d6af9f4e7a0ae761519fb50f721de398dd4e", size = 108084, upload-time = "2026-03-25T20:21:18.978Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/83/dceca96142499c069475b790e7913b1044c1a4337e700751f48ed723f883/tomli-2.4.1-cp311-cp311-win_arm64.whl", hash = "sha256:c2541745709bad0264b7d4705ad453b76ccd191e64aa6f0fc66b69a293a45ece", size = 95285, upload-time = "2026-03-25T20:21:20.309Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/ba/42f134a3fe2b370f555f44b1d72feebb94debcab01676bf918d0cb70e9aa/tomli-2.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c742f741d58a28940ce01d58f0ab2ea3ced8b12402f162f4d534dfe18ba1cd6a", size = 155924, upload-time = "2026-03-25T20:21:21.626Z" },
+    { url = "https://files.pythonhosted.org/packages/dc/c7/62d7a17c26487ade21c5422b646110f2162f1fcc95980ef7f63e73c68f14/tomli-2.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7f86fd587c4ed9dd76f318225e7d9b29cfc5a9d43de44e5754db8d1128487085", size = 150018, upload-time = "2026-03-25T20:21:23.002Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/05/79d13d7c15f13bdef410bdd49a6485b1c37d28968314eabee452c22a7fda/tomli-2.4.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ff18e6a727ee0ab0388507b89d1bc6a22b138d1e2fa56d1ad494586d61d2eae9", size = 244948, upload-time = "2026-03-25T20:21:24.04Z" },
+    { url = "https://files.pythonhosted.org/packages/10/90/d62ce007a1c80d0b2c93e02cab211224756240884751b94ca72df8a875ca/tomli-2.4.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:136443dbd7e1dee43c68ac2694fde36b2849865fa258d39bf822c10e8068eac5", size = 253341, upload-time = "2026-03-25T20:21:25.177Z" },
+    { url = "https://files.pythonhosted.org/packages/1a/7e/caf6496d60152ad4ed09282c1885cca4eea150bfd007da84aea07bcc0a3e/tomli-2.4.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5e262d41726bc187e69af7825504c933b6794dc3fbd5945e41a79bb14c31f585", size = 248159, upload-time = "2026-03-25T20:21:26.364Z" },
+    { url = "https://files.pythonhosted.org/packages/99/e7/c6f69c3120de34bbd882c6fba7975f3d7a746e9218e56ab46a1bc4b42552/tomli-2.4.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5cb41aa38891e073ee49d55fbc7839cfdb2bc0e600add13874d048c94aadddd1", size = 253290, upload-time = "2026-03-25T20:21:27.46Z" },
+    { url = "https://files.pythonhosted.org/packages/d6/2f/4a3c322f22c5c66c4b836ec58211641a4067364f5dcdd7b974b4c5da300c/tomli-2.4.1-cp312-cp312-win32.whl", hash = "sha256:da25dc3563bff5965356133435b757a795a17b17d01dbc0f42fb32447ddfd917", size = 98141, upload-time = "2026-03-25T20:21:28.492Z" },
+    { url = "https://files.pythonhosted.org/packages/24/22/4daacd05391b92c55759d55eaee21e1dfaea86ce5c571f10083360adf534/tomli-2.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:52c8ef851d9a240f11a88c003eacb03c31fc1c9c4ec64a99a0f922b93874fda9", size = 108847, upload-time = "2026-03-25T20:21:29.386Z" },
+    { url = "https://files.pythonhosted.org/packages/68/fd/70e768887666ddd9e9f5d85129e84910f2db2796f9096aa02b721a53098d/tomli-2.4.1-cp312-cp312-win_arm64.whl", hash = "sha256:f758f1b9299d059cc3f6546ae2af89670cb1c4d48ea29c3cacc4fe7de3058257", size = 95088, upload-time = "2026-03-25T20:21:30.677Z" },
+    { url = "https://files.pythonhosted.org/packages/07/06/b823a7e818c756d9a7123ba2cda7d07bc2dd32835648d1a7b7b7a05d848d/tomli-2.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:36d2bd2ad5fb9eaddba5226aa02c8ec3fa4f192631e347b3ed28186d43be6b54", size = 155866, upload-time = "2026-03-25T20:21:31.65Z" },
+    { url = "https://files.pythonhosted.org/packages/14/6f/12645cf7f08e1a20c7eb8c297c6f11d31c1b50f316a7e7e1e1de6e2e7b7e/tomli-2.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:eb0dc4e38e6a1fd579e5d50369aa2e10acfc9cace504579b2faabb478e76941a", size = 149887, upload-time = "2026-03-25T20:21:33.028Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/e0/90637574e5e7212c09099c67ad349b04ec4d6020324539297b634a0192b0/tomli-2.4.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7f2c7f2b9ca6bdeef8f0fa897f8e05085923eb091721675170254cbc5b02897", size = 243704, upload-time = "2026-03-25T20:21:34.51Z" },
+    { url = "https://files.pythonhosted.org/packages/10/8f/d3ddb16c5a4befdf31a23307f72828686ab2096f068eaf56631e136c1fdd/tomli-2.4.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f3c6818a1a86dd6dca7ddcaaf76947d5ba31aecc28cb1b67009a5877c9a64f3f", size = 251628, upload-time = "2026-03-25T20:21:36.012Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/f1/dbeeb9116715abee2485bf0a12d07a8f31af94d71608c171c45f64c0469d/tomli-2.4.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d312ef37c91508b0ab2cee7da26ec0b3ed2f03ce12bd87a588d771ae15dcf82d", size = 247180, upload-time = "2026-03-25T20:21:37.136Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/74/16336ffd19ed4da28a70959f92f506233bd7cfc2332b20bdb01591e8b1d1/tomli-2.4.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:51529d40e3ca50046d7606fa99ce3956a617f9b36380da3b7f0dd3dd28e68cb5", size = 251674, upload-time = "2026-03-25T20:21:38.298Z" },
+    { url = "https://files.pythonhosted.org/packages/16/f9/229fa3434c590ddf6c0aa9af64d3af4b752540686cace29e6281e3458469/tomli-2.4.1-cp313-cp313-win32.whl", hash = "sha256:2190f2e9dd7508d2a90ded5ed369255980a1bcdd58e52f7fe24b8162bf9fedbd", size = 97976, upload-time = "2026-03-25T20:21:39.316Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/1e/71dfd96bcc1c775420cb8befe7a9d35f2e5b1309798f009dca17b7708c1e/tomli-2.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:8d65a2fbf9d2f8352685bc1364177ee3923d6baf5e7f43ea4959d7d8bc326a36", size = 108755, upload-time = "2026-03-25T20:21:40.248Z" },
+    { url = "https://files.pythonhosted.org/packages/83/7a/d34f422a021d62420b78f5c538e5b102f62bea616d1d75a13f0a88acb04a/tomli-2.4.1-cp313-cp313-win_arm64.whl", hash = "sha256:4b605484e43cdc43f0954ddae319fb75f04cc10dd80d830540060ee7cd0243cd", size = 95265, upload-time = "2026-03-25T20:21:41.219Z" },
+    { url = "https://files.pythonhosted.org/packages/3c/fb/9a5c8d27dbab540869f7c1f8eb0abb3244189ce780ba9cd73f3770662072/tomli-2.4.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:fd0409a3653af6c147209d267a0e4243f0ae46b011aa978b1080359fddc9b6cf", size = 155726, upload-time = "2026-03-25T20:21:42.23Z" },
+    { url = "https://files.pythonhosted.org/packages/62/05/d2f816630cc771ad836af54f5001f47a6f611d2d39535364f148b6a92d6b/tomli-2.4.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a120733b01c45e9a0c34aeef92bf0cf1d56cfe81ed9d47d562f9ed591a9828ac", size = 149859, upload-time = "2026-03-25T20:21:43.386Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/48/66341bdb858ad9bd0ceab5a86f90eddab127cf8b046418009f2125630ecb/tomli-2.4.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:559db847dc486944896521f68d8190be1c9e719fced785720d2216fe7022b662", size = 244713, upload-time = "2026-03-25T20:21:44.474Z" },
+    { url = "https://files.pythonhosted.org/packages/df/6d/c5fad00d82b3c7a3ab6189bd4b10e60466f22cfe8a08a9394185c8a8111c/tomli-2.4.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01f520d4f53ef97964a240a035ec2a869fe1a37dde002b57ebc4417a27ccd853", size = 252084, upload-time = "2026-03-25T20:21:45.62Z" },
+    { url = "https://files.pythonhosted.org/packages/00/71/3a69e86f3eafe8c7a59d008d245888051005bd657760e96d5fbfb0b740c2/tomli-2.4.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7f94b27a62cfad8496c8d2513e1a222dd446f095fca8987fceef261225538a15", size = 247973, upload-time = "2026-03-25T20:21:46.937Z" },
+    { url = "https://files.pythonhosted.org/packages/67/50/361e986652847fec4bd5e4a0208752fbe64689c603c7ae5ea7cb16b1c0ca/tomli-2.4.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:ede3e6487c5ef5d28634ba3f31f989030ad6af71edfb0055cbbd14189ff240ba", size = 256223, upload-time = "2026-03-25T20:21:48.467Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/9a/b4173689a9203472e5467217e0154b00e260621caa227b6fa01feab16998/tomli-2.4.1-cp314-cp314-win32.whl", hash = "sha256:3d48a93ee1c9b79c04bb38772ee1b64dcf18ff43085896ea460ca8dec96f35f6", size = 98973, upload-time = "2026-03-25T20:21:49.526Z" },
+    { url = "https://files.pythonhosted.org/packages/14/58/640ac93bf230cd27d002462c9af0d837779f8773bc03dee06b5835208214/tomli-2.4.1-cp314-cp314-win_amd64.whl", hash = "sha256:88dceee75c2c63af144e456745e10101eb67361050196b0b6af5d717254dddf7", size = 109082, upload-time = "2026-03-25T20:21:50.506Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/2f/702d5e05b227401c1068f0d386d79a589bb12bf64c3d2c72ce0631e3bc49/tomli-2.4.1-cp314-cp314-win_arm64.whl", hash = "sha256:b8c198f8c1805dc42708689ed6864951fd2494f924149d3e4bce7710f8eb5232", size = 96490, upload-time = "2026-03-25T20:21:51.474Z" },
+    { url = "https://files.pythonhosted.org/packages/45/4b/b877b05c8ba62927d9865dd980e34a755de541eb65fffba52b4cc495d4d2/tomli-2.4.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:d4d8fe59808a54658fcc0160ecfb1b30f9089906c50b23bcb4c69eddc19ec2b4", size = 164263, upload-time = "2026-03-25T20:21:52.543Z" },
+    { url = "https://files.pythonhosted.org/packages/24/79/6ab420d37a270b89f7195dec5448f79400d9e9c1826df982f3f8e97b24fd/tomli-2.4.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7008df2e7655c495dd12d2a4ad038ff878d4ca4b81fccaf82b714e07eae4402c", size = 160736, upload-time = "2026-03-25T20:21:53.674Z" },
+    { url = "https://files.pythonhosted.org/packages/02/e0/3630057d8eb170310785723ed5adcdfb7d50cb7e6455f85ba8a3deed642b/tomli-2.4.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1d8591993e228b0c930c4bb0db464bdad97b3289fb981255d6c9a41aedc84b2d", size = 270717, upload-time = "2026-03-25T20:21:55.129Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/b4/1613716072e544d1a7891f548d8f9ec6ce2faf42ca65acae01d76ea06bb0/tomli-2.4.1-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:734e20b57ba95624ecf1841e72b53f6e186355e216e5412de414e3c51e5e3c41", size = 278461, upload-time = "2026-03-25T20:21:56.228Z" },
+    { url = "https://files.pythonhosted.org/packages/05/38/30f541baf6a3f6df77b3df16b01ba319221389e2da59427e221ef417ac0c/tomli-2.4.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8a650c2dbafa08d42e51ba0b62740dae4ecb9338eefa093aa5c78ceb546fcd5c", size = 274855, upload-time = "2026-03-25T20:21:57.653Z" },
+    { url = "https://files.pythonhosted.org/packages/77/a3/ec9dd4fd2c38e98de34223b995a3b34813e6bdadf86c75314c928350ed14/tomli-2.4.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:504aa796fe0569bb43171066009ead363de03675276d2d121ac1a4572397870f", size = 283144, upload-time = "2026-03-25T20:21:59.089Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/be/605a6261cac79fba2ec0c9827e986e00323a1945700969b8ee0b30d85453/tomli-2.4.1-cp314-cp314t-win32.whl", hash = "sha256:b1d22e6e9387bf4739fbe23bfa80e93f6b0373a7f1b96c6227c32bef95a4d7a8", size = 108683, upload-time = "2026-03-25T20:22:00.214Z" },
+    { url = "https://files.pythonhosted.org/packages/12/64/da524626d3b9cc40c168a13da8335fe1c51be12c0a63685cc6db7308daae/tomli-2.4.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2c1c351919aca02858f740c6d33adea0c5deea37f9ecca1cc1ef9e884a619d26", size = 121196, upload-time = "2026-03-25T20:22:01.169Z" },
+    { url = "https://files.pythonhosted.org/packages/5a/cd/e80b62269fc78fc36c9af5a6b89c835baa8af28ff5ad28c7028d60860320/tomli-2.4.1-cp314-cp314t-win_arm64.whl", hash = "sha256:eab21f45c7f66c13f2a9e0e1535309cee140182a9cdae1e041d02e47291e8396", size = 100393, upload-time = "2026-03-25T20:22:02.137Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/61/cceae43728b7de99d9b847560c262873a1f6c98202171fd5ed62640b494b/tomli-2.4.1-py3-none-any.whl", hash = "sha256:0d85819802132122da43cb86656f8d1f8c6587d54ae7dcaf30e90533028b49fe", size = 14583, upload-time = "2026-03-25T20:22:03.012Z" },
+]
+
 [[package]]
 name = "tomlkit"
 version = "0.14.0"
@@ -6639,21 +8277,29 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cuda-bindings", marker = "sys_platform == 'linux'" },
     { name = "cuda-toolkit", extra = ["cublas", "cudart", "cufft", "cufile", "cupti", "curand", "cusolver", "cusparse", "nvjitlink", "nvrtc", "nvtx"], marker = "sys_platform == 'linux' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "filelock" },
-    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "fsspec", version = "2026.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "jinja2" },
-    { name = "networkx" },
+    { name = "filelock", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
+    { name = "jinja2", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
+    { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "networkx", version = "3.6.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32') or (python_full_version < '3.11' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'emscripten' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform == 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" },
-    { name = "setuptools" },
-    { name = "sympy" },
+    { name = "setuptools", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
+    { name = "sympy", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
     { name = "triton", marker = "sys_platform == 'never'" },
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
 ]
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/ac/f2/c1690994afe461aae2d0cac62251e6802a703dec0a6c549c02ecd0de92a9/torch-2.11.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2c0d7fcfbc0c4e8bb5ebc3907cbc0c6a0da1b8f82b1fc6e14e914fa0b9baf74e", size = 80526521, upload-time = "2026-03-23T18:12:06.86Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/f0/98ae802fa8c09d3149b0c8690741f3f5753c90e779bd28c9613257295945/torch-2.11.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:4cf8687f4aec3900f748d553483ef40e0ac38411c3c48d0a86a438f6d7a99b18", size = 419723025, upload-time = "2026-03-23T18:11:43.774Z" },
+    { url = "https://files.pythonhosted.org/packages/f9/1e/18a9b10b4bd34f12d4e561c52b0ae7158707b8193c6cfc0aad2b48167090/torch-2.11.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:1b32ceda909818a03b112006709b02be1877240c31750a8d9c6b7bf5f2d8a6e5", size = 530589207, upload-time = "2026-03-23T18:11:23.756Z" },
+    { url = "https://files.pythonhosted.org/packages/35/40/2d532e8c0e23705be9d1debce5bc37b68d59a39bda7584c26fe9668076fe/torch-2.11.0-cp310-cp310-win_amd64.whl", hash = "sha256:b3c712ae6fb8e7a949051a953fc412fe0a6940337336c3b6f905e905dac5157f", size = 114518313, upload-time = "2026-03-23T18:11:58.281Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/0d/98b410492609e34a155fa8b121b55c7dca229f39636851c3a9ec20edea21/torch-2.11.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7b6a60d48062809f58595509c524b88e6ddec3ebe25833d6462eeab81e5f2ce4", size = 80529712, upload-time = "2026-03-23T18:12:02.608Z" },
+    { url = "https://files.pythonhosted.org/packages/84/03/acea680005f098f79fd70c1d9d5ccc0cb4296ec2af539a0450108232fc0c/torch-2.11.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:d91aac77f24082809d2c5a93f52a5f085032740a1ebc9252a7b052ef5a4fddc6", size = 419718178, upload-time = "2026-03-23T18:10:46.675Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/8b/d7be22fbec9ffee6cff31a39f8750d4b3a65d349a286cf4aec74c2375662/torch-2.11.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:7aa2f9bbc6d4595ba72138026b2074be1233186150e9292865e04b7a63b8c67a", size = 530604548, upload-time = "2026-03-23T18:10:03.569Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/bd/9912d30b68845256aabbb4a40aeefeef3c3b20db5211ccda653544ada4b6/torch-2.11.0-cp311-cp311-win_amd64.whl", hash = "sha256:73e24aaf8f36ab90d95cd1761208b2eb70841c2a9ca1a3f9061b39fc5331b708", size = 114519675, upload-time = "2026-03-23T18:11:52.995Z" },
     { url = "https://files.pythonhosted.org/packages/6f/8b/69e3008d78e5cee2b30183340cc425081b78afc5eff3d080daab0adda9aa/torch-2.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4b5866312ee6e52ea625cd211dcb97d6a2cdc1131a5f15cc0d87eec948f6dd34", size = 80606338, upload-time = "2026-03-23T18:11:34.781Z" },
     { url = "https://files.pythonhosted.org/packages/13/16/42e5915ebe4868caa6bac83a8ed59db57f12e9a61b7d749d584776ed53d5/torch-2.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f99924682ef0aa6a4ab3b1b76f40dc6e273fca09f367d15a524266db100a723f", size = 419731115, upload-time = "2026-03-23T18:11:06.944Z" },
     { url = "https://files.pythonhosted.org/packages/1a/c9/82638ef24d7877510f83baf821f5619a61b45568ce21c0a87a91576510aa/torch-2.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:0f68f4ac6d95d12e896c3b7a912b5871619542ec54d3649cf48cc1edd4dd2756", size = 530712279, upload-time = "2026-03-23T18:10:31.481Z" },
@@ -6684,8 +8330,8 @@ dependencies = [
     { name = "docker" },
     { name = "docstring-parser" },
     { name = "filelock" },
-    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'win32' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "fsspec", version = "2026.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32') or (python_full_version < '3.14' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts') or (sys_platform != 'win32' and extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "fsspec", version = "2026.2.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or sys_platform != 'win32' or extra == 'extra-13-megatron-core-dev' or extra == 'extra-13-megatron-core-lts'" },
+    { name = "fsspec", version = "2026.3.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.14' and sys_platform == 'win32' and extra != 'extra-13-megatron-core-dev' and extra != 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "importlib-metadata" },
     { name = "pyre-extensions" },
     { name = "pyyaml" },
@@ -6721,7 +8367,7 @@ dependencies = [
     { name = "onnx", version = "1.21.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "onnxscript", version = "0.5.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "onnxscript", version = "0.6.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.13' and extra == 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" } },
+    { name = "packaging" },
     { name = "pydantic" },
     { name = "torch", marker = "sys_platform == 'never'" },
 ]
@@ -6733,10 +8379,9 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "filelock" },
     { name = "huggingface-hub" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra != 'extra-13-megatron-core-dev') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
+    { name = "packaging" },
     { name = "pyyaml" },
     { name = "regex" },
     { name = "requests" },
@@ -6754,6 +8399,10 @@ name = "triton"
 version = "3.6.0"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/44/ba/b1b04f4b291a3205d95ebd24465de0e5bf010a2df27a4e58a9b5f039d8f2/triton-3.6.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6c723cfb12f6842a0ae94ac307dba7e7a44741d720a40cf0e270ed4a4e3be781", size = 175972180, upload-time = "2026-01-20T16:15:53.664Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/f7/f1c9d3424ab199ac53c2da567b859bcddbb9c9e7154805119f8bd95ec36f/triton-3.6.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6550fae429e0667e397e5de64b332d1e5695b73650ee75a6146e2e902770bea", size = 188105201, upload-time = "2026-01-20T16:00:29.272Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/2c/96f92f3c60387e14cc45aed49487f3486f89ea27106c1b1376913c62abe4/triton-3.6.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:49df5ef37379c0c2b5c0012286f80174fcf0e073e5ade1ca9a86c36814553651", size = 176081190, upload-time = "2026-01-20T16:16:00.523Z" },
+    { url = "https://files.pythonhosted.org/packages/e0/12/b05ba554d2c623bffa59922b94b0775673de251f468a9609bc9e45de95e9/triton-3.6.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e8e323d608e3a9bfcc2d9efcc90ceefb764a82b99dea12a86d643c72539ad5d3", size = 188214640, upload-time = "2026-01-20T16:00:35.869Z" },
     { url = "https://files.pythonhosted.org/packages/17/5d/08201db32823bdf77a0e2b9039540080b2e5c23a20706ddba942924ebcd6/triton-3.6.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:374f52c11a711fd062b4bfbb201fd9ac0a5febd28a96fb41b4a0f51dde3157f4", size = 176128243, upload-time = "2026-01-20T16:16:07.857Z" },
     { url = "https://files.pythonhosted.org/packages/ab/a8/cdf8b3e4c98132f965f88c2313a4b493266832ad47fb52f23d14d4f86bb5/triton-3.6.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:74caf5e34b66d9f3a429af689c1c7128daba1d8208df60e81106b115c00d6fca", size = 188266850, upload-time = "2026-01-20T16:00:43.041Z" },
     { url = "https://files.pythonhosted.org/packages/3c/12/34d71b350e89a204c2c7777a9bba0dcf2f19a5bfdd70b57c4dbc5ffd7154/triton-3.6.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:448e02fe6dc898e9e5aa89cf0ee5c371e99df5aa5e8ad976a80b93334f3494fd", size = 176133521, upload-time = "2026-01-20T16:16:13.321Z" },
@@ -6862,45 +8511,53 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/65/36/2d24b2cbe78547c6532da33fb8613debd3126eccc33a6374ab788f5e46e9/uuid_utils-0.14.1-cp39-abi3-win32.whl", hash = "sha256:b54d6aa6252d96bac1fdbc80d26ba71bad9f220b2724d692ad2f2310c22ef523", size = 183476, upload-time = "2026-02-20T22:50:32.745Z" },
     { url = "https://files.pythonhosted.org/packages/83/92/2d7e90df8b1a69ec4cff33243ce02b7a62f926ef9e2f0eca5a026889cd73/uuid_utils-0.14.1-cp39-abi3-win_amd64.whl", hash = "sha256:fc27638c2ce267a0ce3e06828aff786f91367f093c80625ee21dad0208e0f5ba", size = 187147, upload-time = "2026-02-20T22:50:45.807Z" },
     { url = "https://files.pythonhosted.org/packages/d9/26/529f4beee17e5248e37e0bc17a2761d34c0fa3b1e5729c88adb2065bae6e/uuid_utils-0.14.1-cp39-abi3-win_arm64.whl", hash = "sha256:b04cb49b42afbc4ff8dbc60cf054930afc479d6f4dd7f1ec3bbe5dbfdde06b7a", size = 188132, upload-time = "2026-02-20T22:50:41.718Z" },
+    { url = "https://files.pythonhosted.org/packages/91/f9/6c64bdbf71f58ccde7919e00491812556f446a5291573af92c49a5e9aaef/uuid_utils-0.14.1-pp311-pypy311_pp73-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:b197cd5424cf89fb019ca7f53641d05bfe34b1879614bed111c9c313b5574cd8", size = 591617, upload-time = "2026-02-20T22:50:24.532Z" },
+    { url = "https://files.pythonhosted.org/packages/d0/f0/758c3b0fb0c4871c7704fef26a5bc861de4f8a68e4831669883bebe07b0f/uuid_utils-0.14.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:12c65020ba6cb6abe1d57fcbfc2d0ea0506c67049ee031714057f5caf0f9bc9c", size = 303702, upload-time = "2026-02-20T22:50:40.687Z" },
+    { url = "https://files.pythonhosted.org/packages/85/89/d91862b544c695cd58855efe3201f83894ed82fffe34500774238ab8eba7/uuid_utils-0.14.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b5d2ad28063d422ccc2c28d46471d47b61a58de885d35113a8f18cb547e25bf", size = 337678, upload-time = "2026-02-20T22:50:39.768Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/6b/cf342ba8a898f1de024be0243fac67c025cad530c79ea7f89c4ce718891a/uuid_utils-0.14.1-pp311-pypy311_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:da2234387b45fde40b0fedfee64a0ba591caeea9c48c7698ab6e2d85c7991533", size = 343711, upload-time = "2026-02-20T22:50:43.965Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/20/049418d094d396dfa6606b30af925cc68a6670c3b9103b23e6990f84b589/uuid_utils-0.14.1-pp311-pypy311_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50fffc2827348c1e48972eed3d1c698959e63f9d030aa5dd82ba451113158a62", size = 476731, upload-time = "2026-02-20T22:50:30.589Z" },
+    { url = "https://files.pythonhosted.org/packages/77/a1/0857f64d53a90321e6a46a3d4cc394f50e1366132dcd2ae147f9326ca98b/uuid_utils-0.14.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1dbe718765f70f5b7f9b7f66b6a937802941b1cc56bcf642ce0274169741e01", size = 338902, upload-time = "2026-02-20T22:50:33.927Z" },
+    { url = "https://files.pythonhosted.org/packages/ed/d0/5bf7cbf1ac138c92b9ac21066d18faf4d7e7f651047b700eb192ca4b9fdb/uuid_utils-0.14.1-pp311-pypy311_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:258186964039a8e36db10810c1ece879d229b01331e09e9030bc5dcabe231bd2", size = 364700, upload-time = "2026-02-20T22:50:21.732Z" },
 ]
 
 [[package]]
 name = "uvicorn"
-version = "0.44.0"
+version = "0.43.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
     { name = "h11" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/5e/da/6eee1ff8b6cbeed47eeb5229749168e81eb4b7b999a1a15a7176e51410c9/uvicorn-0.44.0.tar.gz", hash = "sha256:6c942071b68f07e178264b9152f1f16dfac5da85880c4ce06366a96d70d4f31e", size = 86947, upload-time = "2026-04-06T09:23:22.826Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/62/f2/368268300fb8af33743508d738ef7bb4d56afdb46c6d9c0fa3dd515df171/uvicorn-0.43.0.tar.gz", hash = "sha256:ab1652d2fb23abf124f36ccc399828558880def222c3cb3d98d24021520dc6e8", size = 85686, upload-time = "2026-04-03T18:37:48.984Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/b7/23/a5bbd9600dd607411fa644c06ff4951bec3a4d82c4b852374024359c19c0/uvicorn-0.44.0-py3-none-any.whl", hash = "sha256:ce937c99a2cc70279556967274414c087888e8cec9f9c94644dfca11bd3ced89", size = 69425, upload-time = "2026-04-06T09:23:21.524Z" },
+    { url = "https://files.pythonhosted.org/packages/55/df/0cf5b0c451602748fdc7a702d4667f6e209bf96aa6e3160d754234445f2a/uvicorn-0.43.0-py3-none-any.whl", hash = "sha256:46fac64f487fd968cd999e5e49efbbe64bd231b5bd8b4a0b482a23ebce499620", size = 68591, upload-time = "2026-04-03T18:37:47.64Z" },
 ]
 
 [[package]]
 name = "virtualenv"
-version = "21.2.4"
+version = "21.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "distlib" },
     { name = "filelock" },
     { name = "platformdirs" },
     { name = "python-discovery" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/0c/98/3a7e644e19cb26133488caff231be390579860bbbb3da35913c49a1d0a46/virtualenv-21.2.4.tar.gz", hash = "sha256:b294ef68192638004d72524ce7ef303e9d0cf5a44c95ce2e54a7500a6381cada", size = 5850742, upload-time = "2026-04-14T22:15:31.438Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/aa/92/58199fe10049f9703c2666e809c4f686c54ef0a68b0f6afccf518c0b1eb9/virtualenv-21.2.0.tar.gz", hash = "sha256:1720dc3a62ef5b443092e3f499228599045d7fea4c79199770499df8becf9098", size = 5840618, upload-time = "2026-03-09T17:24:38.013Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/27/8d/edd0bd910ff803c308ee9a6b7778621af0d10252219ad9f19ef4d4982a61/virtualenv-21.2.4-py3-none-any.whl", hash = "sha256:29d21e941795206138d0f22f4e45ff7050e5da6c6472299fb7103318763861ac", size = 5831232, upload-time = "2026-04-14T22:15:29.342Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/59/7d02447a55b2e55755011a647479041bc92a82e143f96a8195cb33bd0a1c/virtualenv-21.2.0-py3-none-any.whl", hash = "sha256:1bd755b504931164a5a496d217c014d098426cddc79363ad66ac78125f9d908f", size = 5825084, upload-time = "2026-03-09T17:24:35.378Z" },
 ]
 
 [[package]]
 name = "wandb"
-version = "0.26.0"
+version = "0.25.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "click" },
     { name = "gitpython" },
-    { name = "packaging", version = "25.0", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "packaging", version = "26.1", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts' or extra != 'extra-13-megatron-core-dev'" },
+    { name = "packaging" },
     { name = "platformdirs" },
     { name = "protobuf" },
     { name = "pydantic" },
@@ -6909,17 +8566,17 @@ dependencies = [
     { name = "sentry-sdk" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/93/82/911948663ddf9e5ec6bc5cde19b0fffcb23c4b64a546bf5c084fde76c4cb/wandb-0.26.0.tar.gz", hash = "sha256:0356853895b53fe110e2ed17a1d49c15405498f08e5fbc339deab384f2df45f1", size = 42120837, upload-time = "2026-04-13T19:42:47.282Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/60/bb/eb579bf9abac70934a014a9d4e45346aab307994f3021d201bebe5fa25ec/wandb-0.25.1.tar.gz", hash = "sha256:b2a95cd777ecbe7499599a43158834983448a0048329bc7210ef46ca18d21994", size = 43983308, upload-time = "2026-03-10T23:51:44.227Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/69/16/c0ea55323be74da9b4297d934b8e787251ae944d5776340c5498871927f1/wandb-0.26.0-py3-none-macosx_12_0_arm64.whl", hash = "sha256:1ece94a2a5eda1d0e3a2d8a2fd28aa0187705d6efa5ac4c0b8680083583b7ec1", size = 24800103, upload-time = "2026-04-13T19:42:23.946Z" },
-    { url = "https://files.pythonhosted.org/packages/29/b8/4d38b43747616c4a9304be38b6e78526814deb5c1e01b3b6ebac82ce1cb5/wandb-0.26.0-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:92f6f303fe2af50e3f711833a835150f9b4d8082874bfd9868cf15491ea2947e", size = 25956473, upload-time = "2026-04-13T19:42:26.772Z" },
-    { url = "https://files.pythonhosted.org/packages/45/a6/940ca459d70c7cce7a6f7b395809f8ed051a25b2ce696fb93694c77f065f/wandb-0.26.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:a17aae051a31831388cff880251c1b5bc38fbf6a283a0ee7c543709e8e9633d1", size = 25352442, upload-time = "2026-04-13T19:42:29.438Z" },
-    { url = "https://files.pythonhosted.org/packages/4f/a8/55325da4b240d07ba2a8e1949a05b5942dd3346e14f7fd5e3cc72e46a648/wandb-0.26.0-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:28a14ffc014e523498d077dfde12839b7be586ca8c3190e72e7167c1aea6ee4c", size = 27177821, upload-time = "2026-04-13T19:42:32.055Z" },
-    { url = "https://files.pythonhosted.org/packages/c4/84/e4b0636a3e921e2cffb159b57b5a83787475993e2b5adb6181fbf7712a59/wandb-0.26.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:fb9a63babeee044fecf65a4675f7dfb0efaea4986e498a3bc8f948558af877e7", size = 25522688, upload-time = "2026-04-13T19:42:34.625Z" },
-    { url = "https://files.pythonhosted.org/packages/dd/f0/821a451110dd5f5c39358752abbdcb56c4fcebcc41039c7dcd4b024d2e27/wandb-0.26.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:3eb88fb556a64bf4492cf571bb851d47871901c096f0540d841ccb50f5cbeb66", size = 27486467, upload-time = "2026-04-13T19:42:37.119Z" },
-    { url = "https://files.pythonhosted.org/packages/24/c5/a4eb8fb6e7527584c6ccdf5c9b265283ed0c9d94d26d5eb28f9b48cd5779/wandb-0.26.0-py3-none-win32.whl", hash = "sha256:362828d48d21dd4877e28fdce40421ebdfc16d1fe0b59e8371b12d75bbc3f1e7", size = 24908555, upload-time = "2026-04-13T19:42:39.416Z" },
-    { url = "https://files.pythonhosted.org/packages/35/3d/bf182f3af977e6297fc05bc3fd9bd51feacfe4d2c4ce83c90eb2ad7ce59b/wandb-0.26.0-py3-none-win_amd64.whl", hash = "sha256:21a8346434fd30e1bc13a26b226fc29b6f33a1cb346d610cbcb4040c3b0e1f63", size = 24908559, upload-time = "2026-04-13T19:42:42.019Z" },
-    { url = "https://files.pythonhosted.org/packages/1a/3e/344cb29b593f8e7abc14cc268dafde1974bae3f073b4885476f4fbba3cb8/wandb-0.26.0-py3-none-win_arm64.whl", hash = "sha256:99bd11974e9005d3a3f82e1fabfc4909ffa1fdede23a8839f5fbaea2f5be9033", size = 22936140, upload-time = "2026-04-13T19:42:44.715Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/d8/873553b6818499d1b1de314067d528b892897baf0dc81fedc0e845abc2dd/wandb-0.25.1-py3-none-macosx_12_0_arm64.whl", hash = "sha256:9bb0679a3e2dcd96db9d9b6d3e17d046241d8d122974b24facb85cc93309a8c9", size = 23615900, upload-time = "2026-03-10T23:51:06.278Z" },
+    { url = "https://files.pythonhosted.org/packages/71/ea/b131f319aaa5d0bf7572b6bfcff3dd89e1cf92b17eee443bbab71d12d74c/wandb-0.25.1-py3-none-macosx_12_0_x86_64.whl", hash = "sha256:0fb13ed18914027523e7b4fc20380c520e0d10da0ee452f924a13f84509fbe12", size = 25576144, upload-time = "2026-03-10T23:51:11.527Z" },
+    { url = "https://files.pythonhosted.org/packages/70/5f/81508581f0bb77b0495665c1c78e77606a48e66e855ca71ba7c8ae29efa4/wandb-0.25.1-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:cc4521eb5223429ddab5e8eee9b42fdf4caabdf0bc4e0e809042720e5fbef0ed", size = 23070425, upload-time = "2026-03-10T23:51:15.71Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/c7/445155ef010e2e35d190797d7c36ff441e062a5b566a6da4778e22233395/wandb-0.25.1-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:e73b4c55b947edae349232d5845204d30fac88e18eb4ad1d4b96bf7cf898405a", size = 25628142, upload-time = "2026-03-10T23:51:19.326Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/63/f5c55ee00cf481ef1ccd3c385a0585ad52e7840d08419d4f82ddbeeea959/wandb-0.25.1-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:22b84065aa398e1624d2e5ad79e08bc4d2af41a6db61697b03b3aaba332977c6", size = 23123172, upload-time = "2026-03-10T23:51:23.418Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/d9/19eb7974c0e9253bcbaee655222c0f0e1a52e63e9479ee711b4208f8ac31/wandb-0.25.1-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:005c4c6b5126ef8f4b4110e5372d950918b00637d6dc4b615ad17445f9739478", size = 25714479, upload-time = "2026-03-10T23:51:27.421Z" },
+    { url = "https://files.pythonhosted.org/packages/11/19/466c1d03323a4a0ed7d4036a59b18d6b6f67cb5032e444205927e226b18d/wandb-0.25.1-py3-none-win32.whl", hash = "sha256:8f2d04f16b88d65bfba9d79fb945f6c64e2686215469a841936e0972be8ec6a5", size = 24967338, upload-time = "2026-03-10T23:51:31.833Z" },
+    { url = "https://files.pythonhosted.org/packages/89/22/680d34c1587f3a979c701b66d71aa7c42b4ef2fdf0774f67034e618e834e/wandb-0.25.1-py3-none-win_amd64.whl", hash = "sha256:62db5166de14456156d7a85953a58733a631228e6d4248a753605f75f75fb845", size = 24967343, upload-time = "2026-03-10T23:51:36.026Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/e8/76836b75d401ff5912aaf513176e64557ceaec4c4946bfd38a698ff84d48/wandb-0.25.1-py3-none-win_arm64.whl", hash = "sha256:cc7c34b70cf4b7be4d395541e82e325fd9d2be978d62c9ec01f1a7141523b6bb", size = 22080774, upload-time = "2026-03-10T23:51:40.196Z" },
 ]
 
 [[package]]
@@ -6931,6 +8588,31 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/c2/c9/8869df9b2a2d6c59d79220a4db37679e74f807c559ffe5265e08b227a210/watchfiles-1.1.1.tar.gz", hash = "sha256:a173cb5c16c4f40ab19cecf48a534c409f7ea983ab8fed0741304a1c0a31b3f2", size = 94440, upload-time = "2025-10-14T15:06:21.08Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/a7/1a/206e8cf2dd86fddf939165a57b4df61607a1e0add2785f170a3f616b7d9f/watchfiles-1.1.1-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:eef58232d32daf2ac67f42dea51a2c80f0d03379075d44a587051e63cc2e368c", size = 407318, upload-time = "2025-10-14T15:04:18.753Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/0f/abaf5262b9c496b5dad4ed3c0e799cbecb1f8ea512ecb6ddd46646a9fca3/watchfiles-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:03fa0f5237118a0c5e496185cafa92878568b652a2e9a9382a5151b1a0380a43", size = 394478, upload-time = "2025-10-14T15:04:20.297Z" },
+    { url = "https://files.pythonhosted.org/packages/b1/04/9cc0ba88697b34b755371f5ace8d3a4d9a15719c07bdc7bd13d7d8c6a341/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8ca65483439f9c791897f7db49202301deb6e15fe9f8fe2fed555bf986d10c31", size = 449894, upload-time = "2025-10-14T15:04:21.527Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/9c/eda4615863cd8621e89aed4df680d8c3ec3da6a4cf1da113c17decd87c7f/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f0ab1c1af0cb38e3f598244c17919fb1a84d1629cc08355b0074b6d7f53138ac", size = 459065, upload-time = "2025-10-14T15:04:22.795Z" },
+    { url = "https://files.pythonhosted.org/packages/84/13/f28b3f340157d03cbc8197629bc109d1098764abe1e60874622a0be5c112/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3bc570d6c01c206c46deb6e935a260be44f186a2f05179f52f7fcd2be086a94d", size = 488377, upload-time = "2025-10-14T15:04:24.138Z" },
+    { url = "https://files.pythonhosted.org/packages/86/93/cfa597fa9389e122488f7ffdbd6db505b3b915ca7435ecd7542e855898c2/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e84087b432b6ac94778de547e08611266f1f8ffad28c0ee4c82e028b0fc5966d", size = 595837, upload-time = "2025-10-14T15:04:25.057Z" },
+    { url = "https://files.pythonhosted.org/packages/57/1e/68c1ed5652b48d89fc24d6af905d88ee4f82fa8bc491e2666004e307ded1/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:620bae625f4cb18427b1bb1a2d9426dc0dd5a5ba74c7c2cdb9de405f7b129863", size = 473456, upload-time = "2025-10-14T15:04:26.497Z" },
+    { url = "https://files.pythonhosted.org/packages/d5/dc/1a680b7458ffa3b14bb64878112aefc8f2e4f73c5af763cbf0bd43100658/watchfiles-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:544364b2b51a9b0c7000a4b4b02f90e9423d97fbbf7e06689236443ebcad81ab", size = 455614, upload-time = "2025-10-14T15:04:27.539Z" },
+    { url = "https://files.pythonhosted.org/packages/61/a5/3d782a666512e01eaa6541a72ebac1d3aae191ff4a31274a66b8dd85760c/watchfiles-1.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:bbe1ef33d45bc71cf21364df962af171f96ecaeca06bd9e3d0b583efb12aec82", size = 630690, upload-time = "2025-10-14T15:04:28.495Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/73/bb5f38590e34687b2a9c47a244aa4dd50c56a825969c92c9c5fc7387cea1/watchfiles-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1a0bb430adb19ef49389e1ad368450193a90038b5b752f4ac089ec6942c4dff4", size = 622459, upload-time = "2025-10-14T15:04:29.491Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/ac/c9bb0ec696e07a20bd58af5399aeadaef195fb2c73d26baf55180fe4a942/watchfiles-1.1.1-cp310-cp310-win32.whl", hash = "sha256:3f6d37644155fb5beca5378feb8c1708d5783145f2a0f1c4d5a061a210254844", size = 272663, upload-time = "2025-10-14T15:04:30.435Z" },
+    { url = "https://files.pythonhosted.org/packages/11/a0/a60c5a7c2ec59fa062d9a9c61d02e3b6abd94d32aac2d8344c4bdd033326/watchfiles-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:a36d8efe0f290835fd0f33da35042a1bb5dc0e83cbc092dcf69bce442579e88e", size = 287453, upload-time = "2025-10-14T15:04:31.53Z" },
+    { url = "https://files.pythonhosted.org/packages/1f/f8/2c5f479fb531ce2f0564eda479faecf253d886b1ab3630a39b7bf7362d46/watchfiles-1.1.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:f57b396167a2565a4e8b5e56a5a1c537571733992b226f4f1197d79e94cf0ae5", size = 406529, upload-time = "2025-10-14T15:04:32.899Z" },
+    { url = "https://files.pythonhosted.org/packages/fe/cd/f515660b1f32f65df671ddf6f85bfaca621aee177712874dc30a97397977/watchfiles-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:421e29339983e1bebc281fab40d812742268ad057db4aee8c4d2bce0af43b741", size = 394384, upload-time = "2025-10-14T15:04:33.761Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/c3/28b7dc99733eab43fca2d10f55c86e03bd6ab11ca31b802abac26b23d161/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6e43d39a741e972bab5d8100b5cdacf69db64e34eb19b6e9af162bccf63c5cc6", size = 448789, upload-time = "2025-10-14T15:04:34.679Z" },
+    { url = "https://files.pythonhosted.org/packages/4a/24/33e71113b320030011c8e4316ccca04194bf0cbbaeee207f00cbc7d6b9f5/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f537afb3276d12814082a2e9b242bdcf416c2e8fd9f799a737990a1dbe906e5b", size = 460521, upload-time = "2025-10-14T15:04:35.963Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/c3/3c9a55f255aa57b91579ae9e98c88704955fa9dac3e5614fb378291155df/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b2cd9e04277e756a2e2d2543d65d1e2166d6fd4c9b183f8808634fda23f17b14", size = 488722, upload-time = "2025-10-14T15:04:37.091Z" },
+    { url = "https://files.pythonhosted.org/packages/49/36/506447b73eb46c120169dc1717fe2eff07c234bb3232a7200b5f5bd816e9/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5f3f58818dc0b07f7d9aa7fe9eb1037aecb9700e63e1f6acfed13e9fef648f5d", size = 596088, upload-time = "2025-10-14T15:04:38.39Z" },
+    { url = "https://files.pythonhosted.org/packages/82/ab/5f39e752a9838ec4d52e9b87c1e80f1ee3ccdbe92e183c15b6577ab9de16/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bb9f66367023ae783551042d31b1d7fd422e8289eedd91f26754a66f44d5cff", size = 472923, upload-time = "2025-10-14T15:04:39.666Z" },
+    { url = "https://files.pythonhosted.org/packages/af/b9/a419292f05e302dea372fa7e6fda5178a92998411f8581b9830d28fb9edb/watchfiles-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aebfd0861a83e6c3d1110b78ad54704486555246e542be3e2bb94195eabb2606", size = 456080, upload-time = "2025-10-14T15:04:40.643Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/c3/d5932fd62bde1a30c36e10c409dc5d54506726f08cb3e1d8d0ba5e2bc8db/watchfiles-1.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:5fac835b4ab3c6487b5dbad78c4b3724e26bcc468e886f8ba8cc4306f68f6701", size = 629432, upload-time = "2025-10-14T15:04:41.789Z" },
+    { url = "https://files.pythonhosted.org/packages/f7/77/16bddd9779fafb795f1a94319dc965209c5641db5bf1edbbccace6d1b3c0/watchfiles-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:399600947b170270e80134ac854e21b3ccdefa11a9529a3decc1327088180f10", size = 623046, upload-time = "2025-10-14T15:04:42.718Z" },
+    { url = "https://files.pythonhosted.org/packages/46/ef/f2ecb9a0f342b4bfad13a2787155c6ee7ce792140eac63a34676a2feeef2/watchfiles-1.1.1-cp311-cp311-win32.whl", hash = "sha256:de6da501c883f58ad50db3a32ad397b09ad29865b5f26f64c24d3e3281685849", size = 271473, upload-time = "2025-10-14T15:04:43.624Z" },
+    { url = "https://files.pythonhosted.org/packages/94/bc/f42d71125f19731ea435c3948cad148d31a64fccde3867e5ba4edee901f9/watchfiles-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:35c53bd62a0b885bf653ebf6b700d1bf05debb78ad9292cf2a942b23513dc4c4", size = 287598, upload-time = "2025-10-14T15:04:44.516Z" },
+    { url = "https://files.pythonhosted.org/packages/57/c9/a30f897351f95bbbfb6abcadafbaca711ce1162f4db95fc908c98a9165f3/watchfiles-1.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:57ca5281a8b5e27593cb7d82c2ac927ad88a96ed406aa446f6344e4328208e9e", size = 277210, upload-time = "2025-10-14T15:04:45.883Z" },
     { url = "https://files.pythonhosted.org/packages/74/d5/f039e7e3c639d9b1d09b07ea412a6806d38123f0508e5f9b48a87b0a76cc/watchfiles-1.1.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:8c89f9f2f740a6b7dcc753140dd5e1ab9215966f7a3530d0c0705c83b401bd7d", size = 404745, upload-time = "2025-10-14T15:04:46.731Z" },
     { url = "https://files.pythonhosted.org/packages/a5/96/a881a13aa1349827490dab2d363c8039527060cfcc2c92cc6d13d1b1049e/watchfiles-1.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bd404be08018c37350f0d6e34676bd1e2889990117a2b90070b3007f172d0610", size = 391769, upload-time = "2025-10-14T15:04:48.003Z" },
     { url = "https://files.pythonhosted.org/packages/4b/5b/d3b460364aeb8da471c1989238ea0e56bec24b6042a68046adf3d9ddb01c/watchfiles-1.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8526e8f916bb5b9a0a777c8317c23ce65de259422bba5b31325a6fa6029d33af", size = 449374, upload-time = "2025-10-14T15:04:49.179Z" },
@@ -6990,6 +8672,14 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4f/55/2af26693fd15165c4ff7857e38330e1b61ab8c37d15dc79118cdba115b7a/watchfiles-1.1.1-cp314-cp314t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c91ed27800188c2ae96d16e3149f199d62f86c7af5f5f4d2c61a3ed8cd3666c", size = 455072, upload-time = "2025-10-14T15:05:48.928Z" },
     { url = "https://files.pythonhosted.org/packages/66/1d/d0d200b10c9311ec25d2273f8aad8c3ef7cc7ea11808022501811208a750/watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_aarch64.whl", hash = "sha256:311ff15a0bae3714ffb603e6ba6dbfba4065ab60865d15a6ec544133bdb21099", size = 629104, upload-time = "2025-10-14T15:05:49.908Z" },
     { url = "https://files.pythonhosted.org/packages/e3/bd/fa9bb053192491b3867ba07d2343d9f2252e00811567d30ae8d0f78136fe/watchfiles-1.1.1-cp314-cp314t-musllinux_1_1_x86_64.whl", hash = "sha256:a916a2932da8f8ab582f242c065f5c81bed3462849ca79ee357dd9551b0e9b01", size = 622112, upload-time = "2025-10-14T15:05:50.941Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/4c/a888c91e2e326872fa4705095d64acd8aa2fb9c1f7b9bd0588f33850516c/watchfiles-1.1.1-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:17ef139237dfced9da49fb7f2232c86ca9421f666d78c264c7ffca6601d154c3", size = 409611, upload-time = "2025-10-14T15:06:05.809Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/c7/5420d1943c8e3ce1a21c0a9330bcf7edafb6aa65d26b21dbb3267c9e8112/watchfiles-1.1.1-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:672b8adf25b1a0d35c96b5888b7b18699d27d4194bac8beeae75be4b7a3fc9b2", size = 396889, upload-time = "2025-10-14T15:06:07.035Z" },
+    { url = "https://files.pythonhosted.org/packages/0c/e5/0072cef3804ce8d3aaddbfe7788aadff6b3d3f98a286fdbee9fd74ca59a7/watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77a13aea58bc2b90173bc69f2a90de8e282648939a00a602e1dc4ee23e26b66d", size = 451616, upload-time = "2025-10-14T15:06:08.072Z" },
+    { url = "https://files.pythonhosted.org/packages/83/4e/b87b71cbdfad81ad7e83358b3e447fedd281b880a03d64a760fe0a11fc2e/watchfiles-1.1.1-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b495de0bb386df6a12b18335a0285dda90260f51bdb505503c02bcd1ce27a8b", size = 458413, upload-time = "2025-10-14T15:06:09.209Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/8e/e500f8b0b77be4ff753ac94dc06b33d8f0d839377fee1b78e8c8d8f031bf/watchfiles-1.1.1-pp311-pypy311_pp73-macosx_10_12_x86_64.whl", hash = "sha256:db476ab59b6765134de1d4fe96a1a9c96ddf091683599be0f26147ea1b2e4b88", size = 408250, upload-time = "2025-10-14T15:06:10.264Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/95/615e72cd27b85b61eec764a5ca51bd94d40b5adea5ff47567d9ebc4d275a/watchfiles-1.1.1-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:89eef07eee5e9d1fda06e38822ad167a044153457e6fd997f8a858ab7564a336", size = 396117, upload-time = "2025-10-14T15:06:11.28Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/81/e7fe958ce8a7fb5c73cc9fb07f5aeaf755e6aa72498c57d760af760c91f8/watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce19e06cbda693e9e7686358af9cd6f5d61312ab8b00488bc36f5aabbaf77e24", size = 450493, upload-time = "2025-10-14T15:06:12.321Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/d4/ed38dd3b1767193de971e694aa544356e63353c33a85d948166b5ff58b9e/watchfiles-1.1.1-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e6f39af2eab0118338902798b5aa6664f46ff66bc0280de76fca67a7f262a49", size = 457546, upload-time = "2025-10-14T15:06:13.372Z" },
 ]
 
 [[package]]
@@ -7019,8 +8709,8 @@ version = "1.0.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "braceexpand" },
-    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-dev'" },
-    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'extra-13-megatron-core-lts'" },
+    { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra == 'extra-13-megatron-core-lts') or extra == 'extra-13-megatron-core-dev'" },
+    { name = "numpy", version = "2.4.4", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra == 'extra-13-megatron-core-lts') or (extra == 'extra-13-megatron-core-dev' and extra == 'extra-13-megatron-core-lts')" },
     { name = "pyyaml" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5a/3a/68800d92e065cf4750ebecf973b13979c0c929b439e1293012938862038d/webdataset-1.0.2.tar.gz", hash = "sha256:7f0498be827cfa46cc5430a58768a24e2c6a410676a61be1838f53d61afdaab4", size = 80090, upload-time = "2025-06-19T23:26:21.945Z" }
@@ -7034,6 +8724,24 @@ version = "16.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/04/24/4b2031d72e840ce4c1ccb255f693b15c334757fc50023e4db9537080b8c4/websockets-16.0.tar.gz", hash = "sha256:5f6261a5e56e8d5c42a4497b364ea24d94d9563e8fbd44e78ac40879c60179b5", size = 179346, upload-time = "2026-01-10T09:23:47.181Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/20/74/221f58decd852f4b59cc3354cccaf87e8ef695fede361d03dc9a7396573b/websockets-16.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:04cdd5d2d1dacbad0a7bf36ccbcd3ccd5a30ee188f2560b7a62a30d14107b31a", size = 177343, upload-time = "2026-01-10T09:22:21.28Z" },
+    { url = "https://files.pythonhosted.org/packages/19/0f/22ef6107ee52ab7f0b710d55d36f5a5d3ef19e8a205541a6d7ffa7994e5a/websockets-16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8ff32bb86522a9e5e31439a58addbb0166f0204d64066fb955265c4e214160f0", size = 175021, upload-time = "2026-01-10T09:22:22.696Z" },
+    { url = "https://files.pythonhosted.org/packages/10/40/904a4cb30d9b61c0e278899bf36342e9b0208eb3c470324a9ecbaac2a30f/websockets-16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:583b7c42688636f930688d712885cf1531326ee05effd982028212ccc13e5957", size = 175320, upload-time = "2026-01-10T09:22:23.94Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/2f/4b3ca7e106bc608744b1cdae041e005e446124bebb037b18799c2d356864/websockets-16.0-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:7d837379b647c0c4c2355c2499723f82f1635fd2c26510e1f587d89bc2199e72", size = 183815, upload-time = "2026-01-10T09:22:25.469Z" },
+    { url = "https://files.pythonhosted.org/packages/86/26/d40eaa2a46d4302becec8d15b0fc5e45bdde05191e7628405a19cf491ccd/websockets-16.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:df57afc692e517a85e65b72e165356ed1df12386ecb879ad5693be08fac65dde", size = 185054, upload-time = "2026-01-10T09:22:27.101Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/ba/6500a0efc94f7373ee8fefa8c271acdfd4dca8bd49a90d4be7ccabfc397e/websockets-16.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:2b9f1e0d69bc60a4a87349d50c09a037a2607918746f07de04df9e43252c77a3", size = 184565, upload-time = "2026-01-10T09:22:28.293Z" },
+    { url = "https://files.pythonhosted.org/packages/04/b4/96bf2cee7c8d8102389374a2616200574f5f01128d1082f44102140344cc/websockets-16.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:335c23addf3d5e6a8633f9f8eda77efad001671e80b95c491dd0924587ece0b3", size = 183848, upload-time = "2026-01-10T09:22:30.394Z" },
+    { url = "https://files.pythonhosted.org/packages/02/8e/81f40fb00fd125357814e8c3025738fc4ffc3da4b6b4a4472a82ba304b41/websockets-16.0-cp310-cp310-win32.whl", hash = "sha256:37b31c1623c6605e4c00d466c9d633f9b812ea430c11c8a278774a1fde1acfa9", size = 178249, upload-time = "2026-01-10T09:22:32.083Z" },
+    { url = "https://files.pythonhosted.org/packages/b4/5f/7e40efe8df57db9b91c88a43690ac66f7b7aa73a11aa6a66b927e44f26fa/websockets-16.0-cp310-cp310-win_amd64.whl", hash = "sha256:8e1dab317b6e77424356e11e99a432b7cb2f3ec8c5ab4dabbcee6add48f72b35", size = 178685, upload-time = "2026-01-10T09:22:33.345Z" },
+    { url = "https://files.pythonhosted.org/packages/f2/db/de907251b4ff46ae804ad0409809504153b3f30984daf82a1d84a9875830/websockets-16.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:31a52addea25187bde0797a97d6fc3d2f92b6f72a9370792d65a6e84615ac8a8", size = 177340, upload-time = "2026-01-10T09:22:34.539Z" },
+    { url = "https://files.pythonhosted.org/packages/f3/fa/abe89019d8d8815c8781e90d697dec52523fb8ebe308bf11664e8de1877e/websockets-16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:417b28978cdccab24f46400586d128366313e8a96312e4b9362a4af504f3bbad", size = 175022, upload-time = "2026-01-10T09:22:36.332Z" },
+    { url = "https://files.pythonhosted.org/packages/58/5d/88ea17ed1ded2079358b40d31d48abe90a73c9e5819dbcde1606e991e2ad/websockets-16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:af80d74d4edfa3cb9ed973a0a5ba2b2a549371f8a741e0800cb07becdd20f23d", size = 175319, upload-time = "2026-01-10T09:22:37.602Z" },
+    { url = "https://files.pythonhosted.org/packages/d2/ae/0ee92b33087a33632f37a635e11e1d99d429d3d323329675a6022312aac2/websockets-16.0-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:08d7af67b64d29823fed316505a89b86705f2b7981c07848fb5e3ea3020c1abe", size = 184631, upload-time = "2026-01-10T09:22:38.789Z" },
+    { url = "https://files.pythonhosted.org/packages/c8/c5/27178df583b6c5b31b29f526ba2da5e2f864ecc79c99dae630a85d68c304/websockets-16.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7be95cfb0a4dae143eaed2bcba8ac23f4892d8971311f1b06f3c6b78952ee70b", size = 185870, upload-time = "2026-01-10T09:22:39.893Z" },
+    { url = "https://files.pythonhosted.org/packages/87/05/536652aa84ddc1c018dbb7e2c4cbcd0db884580bf8e95aece7593fde526f/websockets-16.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d6297ce39ce5c2e6feb13c1a996a2ded3b6832155fcfc920265c76f24c7cceb5", size = 185361, upload-time = "2026-01-10T09:22:41.016Z" },
+    { url = "https://files.pythonhosted.org/packages/6d/e2/d5332c90da12b1e01f06fb1b85c50cfc489783076547415bf9f0a659ec19/websockets-16.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1c1b30e4f497b0b354057f3467f56244c603a79c0d1dafce1d16c283c25f6e64", size = 184615, upload-time = "2026-01-10T09:22:42.442Z" },
+    { url = "https://files.pythonhosted.org/packages/77/fb/d3f9576691cae9253b51555f841bc6600bf0a983a461c79500ace5a5b364/websockets-16.0-cp311-cp311-win32.whl", hash = "sha256:5f451484aeb5cafee1ccf789b1b66f535409d038c56966d6101740c1614b86c6", size = 178246, upload-time = "2026-01-10T09:22:43.654Z" },
+    { url = "https://files.pythonhosted.org/packages/54/67/eaff76b3dbaf18dcddabc3b8c1dba50b483761cccff67793897945b37408/websockets-16.0-cp311-cp311-win_amd64.whl", hash = "sha256:8d7f0659570eefb578dacde98e24fb60af35350193e4f56e11190787bee77dac", size = 178684, upload-time = "2026-01-10T09:22:44.941Z" },
     { url = "https://files.pythonhosted.org/packages/84/7b/bac442e6b96c9d25092695578dda82403c77936104b5682307bd4deb1ad4/websockets-16.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:71c989cbf3254fbd5e84d3bff31e4da39c43f884e64f2551d14bb3c186230f00", size = 177365, upload-time = "2026-01-10T09:22:46.787Z" },
     { url = "https://files.pythonhosted.org/packages/b0/fe/136ccece61bd690d9c1f715baaeefd953bb2360134de73519d5df19d29ca/websockets-16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8b6e209ffee39ff1b6d0fa7bfef6de950c60dfb91b8fcead17da4ee539121a79", size = 175038, upload-time = "2026-01-10T09:22:47.999Z" },
     { url = "https://files.pythonhosted.org/packages/40/1e/9771421ac2286eaab95b8575b0cb701ae3663abf8b5e1f64f1fd90d0a673/websockets-16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:86890e837d61574c92a97496d590968b23c2ef0aeb8a9bc9421d174cd378ae39", size = 175328, upload-time = "2026-01-10T09:22:49.809Z" },
@@ -7070,6 +8778,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/88/a8/a080593f89b0138b6cba1b28f8df5673b5506f72879322288b031337c0b8/websockets-16.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:32da954ffa2814258030e5a57bc73a3635463238e797c7375dc8091327434206", size = 185356, upload-time = "2026-01-10T09:23:32.627Z" },
     { url = "https://files.pythonhosted.org/packages/c2/b6/b9afed2afadddaf5ebb2afa801abf4b0868f42f8539bfe4b071b5266c9fe/websockets-16.0-cp314-cp314t-win32.whl", hash = "sha256:5a4b4cc550cb665dd8a47f868c8d04c8230f857363ad3c9caf7a0c3bf8c61ca6", size = 178085, upload-time = "2026-01-10T09:23:33.816Z" },
     { url = "https://files.pythonhosted.org/packages/9f/3e/28135a24e384493fa804216b79a6a6759a38cc4ff59118787b9fb693df93/websockets-16.0-cp314-cp314t-win_amd64.whl", hash = "sha256:b14dc141ed6d2dde437cddb216004bcac6a1df0935d79656387bd41632ba0bbd", size = 178531, upload-time = "2026-01-10T09:23:35.016Z" },
+    { url = "https://files.pythonhosted.org/packages/72/07/c98a68571dcf256e74f1f816b8cc5eae6eb2d3d5cfa44d37f801619d9166/websockets-16.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:349f83cd6c9a415428ee1005cadb5c2c56f4389bc06a9af16103c3bc3dcc8b7d", size = 174947, upload-time = "2026-01-10T09:23:36.166Z" },
+    { url = "https://files.pythonhosted.org/packages/7e/52/93e166a81e0305b33fe416338be92ae863563fe7bce446b0f687b9df5aea/websockets-16.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:4a1aba3340a8dca8db6eb5a7986157f52eb9e436b74813764241981ca4888f03", size = 175260, upload-time = "2026-01-10T09:23:37.409Z" },
+    { url = "https://files.pythonhosted.org/packages/56/0c/2dbf513bafd24889d33de2ff0368190a0e69f37bcfa19009ef819fe4d507/websockets-16.0-pp311-pypy311_pp73-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:f4a32d1bd841d4bcbffdcb3d2ce50c09c3909fbead375ab28d0181af89fd04da", size = 176071, upload-time = "2026-01-10T09:23:39.158Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/8f/aea9c71cc92bf9b6cc0f7f70df8f0b420636b6c96ef4feee1e16f80f75dd/websockets-16.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0298d07ee155e2e9fda5be8a9042200dd2e3bb0b8a38482156576f863a9d457c", size = 176968, upload-time = "2026-01-10T09:23:41.031Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/3f/f70e03f40ffc9a30d817eef7da1be72ee4956ba8d7255c399a01b135902a/websockets-16.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:a653aea902e0324b52f1613332ddf50b00c06fdaf7e92624fbf8c77c78fa5767", size = 178735, upload-time = "2026-01-10T09:23:42.259Z" },
     { url = "https://files.pythonhosted.org/packages/6f/28/258ebab549c2bf3e64d2b0217b973467394a9cea8c42f70418ca2c5d0d2e/websockets-16.0-py3-none-any.whl", hash = "sha256:1637db62fad1dc833276dded54215f2c7fa46912301a24bd94d45d46a011ceec", size = 171598, upload-time = "2026-01-10T09:23:45.395Z" },
 ]
 
@@ -7106,6 +8819,28 @@ version = "2.1.2"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678, upload-time = "2026-03-06T02:53:25.134Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/da/d2/387594fb592d027366645f3d7cc9b4d7ca7be93845fbaba6d835a912ef3c/wrapt-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a86d99a14f76facb269dc148590c01aaf47584071809a70da30555228158c", size = 60669, upload-time = "2026-03-06T02:52:40.671Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/18/3f373935bc5509e7ac444c8026a56762e50c1183e7061797437ca96c12ce/wrapt-2.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a819e39017f95bf7aede768f75915635aa8f671f2993c036991b8d3bfe8dbb6f", size = 61603, upload-time = "2026-03-06T02:54:21.032Z" },
+    { url = "https://files.pythonhosted.org/packages/c2/7a/32758ca2853b07a887a4574b74e28843919103194bb47001a304e24af62f/wrapt-2.1.2-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5681123e60aed0e64c7d44f72bbf8b4ce45f79d81467e2c4c728629f5baf06eb", size = 113632, upload-time = "2026-03-06T02:53:54.121Z" },
+    { url = "https://files.pythonhosted.org/packages/1d/d5/eeaa38f670d462e97d978b3b0d9ce06d5b91e54bebac6fbed867809216e7/wrapt-2.1.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b8b28e97a44d21836259739ae76284e180b18abbb4dcfdff07a415cf1016c3e", size = 115644, upload-time = "2026-03-06T02:54:53.33Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/09/2a41506cb17affb0bdf9d5e2129c8c19e192b388c4c01d05e1b14db23c00/wrapt-2.1.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cef91c95a50596fcdc31397eb6955476f82ae8a3f5a8eabdc13611b60ee380ba", size = 112016, upload-time = "2026-03-06T02:54:43.274Z" },
+    { url = "https://files.pythonhosted.org/packages/64/15/0e6c3f5e87caadc43db279724ee36979246d5194fa32fed489c73643ba59/wrapt-2.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dad63212b168de8569b1c512f4eac4b57f2c6934b30df32d6ee9534a79f1493f", size = 114823, upload-time = "2026-03-06T02:54:29.392Z" },
+    { url = "https://files.pythonhosted.org/packages/56/b2/0ad17c8248f4e57bedf44938c26ec3ee194715f812d2dbbd9d7ff4be6c06/wrapt-2.1.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d307aa6888d5efab2c1cde09843d48c843990be13069003184b67d426d145394", size = 111244, upload-time = "2026-03-06T02:54:02.149Z" },
+    { url = "https://files.pythonhosted.org/packages/ff/04/bcdba98c26f2c6522c7c09a726d5d9229120163493620205b2f76bd13c01/wrapt-2.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c87cf3f0c85e27b3ac7d9ad95da166bf8739ca215a8b171e8404a2d739897a45", size = 113307, upload-time = "2026-03-06T02:54:12.428Z" },
+    { url = "https://files.pythonhosted.org/packages/0e/1b/5e2883c6bc14143924e465a6fc5a92d09eeabe35310842a481fb0581f832/wrapt-2.1.2-cp310-cp310-win32.whl", hash = "sha256:d1c5fea4f9fe3762e2b905fdd67df51e4be7a73b7674957af2d2ade71a5c075d", size = 57986, upload-time = "2026-03-06T02:54:26.823Z" },
+    { url = "https://files.pythonhosted.org/packages/42/5a/4efc997bccadd3af5749c250b49412793bc41e13a83a486b2b54a33e240c/wrapt-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:d8f7740e1af13dff2684e4d56fe604a7e04d6c94e737a60568d8d4238b9a0c71", size = 60336, upload-time = "2026-03-06T02:54:18Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/f5/a2bb833e20181b937e87c242645ed5d5aa9c373006b0467bfe1a35c727d0/wrapt-2.1.2-cp310-cp310-win_arm64.whl", hash = "sha256:1c6cc827c00dc839350155f316f1f8b4b0c370f52b6a19e782e2bda89600c7dc", size = 58757, upload-time = "2026-03-06T02:53:51.545Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/81/60c4471fce95afa5922ca09b88a25f03c93343f759aae0f31fb4412a85c7/wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb", size = 60666, upload-time = "2026-03-06T02:52:58.934Z" },
+    { url = "https://files.pythonhosted.org/packages/6b/be/80e80e39e7cb90b006a0eaf11c73ac3a62bbfb3068469aec15cc0bc795de/wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d", size = 61601, upload-time = "2026-03-06T02:53:00.487Z" },
+    { url = "https://files.pythonhosted.org/packages/b0/be/d7c88cd9293c859fc74b232abdc65a229bb953997995d6912fc85af18323/wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894", size = 114057, upload-time = "2026-03-06T02:52:44.08Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/25/36c04602831a4d685d45a93b3abea61eca7fe35dab6c842d6f5d570ef94a/wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842", size = 116099, upload-time = "2026-03-06T02:54:56.74Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/4e/98a6eb417ef551dc277bec1253d5246b25003cf36fdf3913b65cb7657a56/wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8", size = 112457, upload-time = "2026-03-06T02:53:52.842Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/a6/a6f7186a5297cad8ec53fd7578533b28f795fdf5372368c74bd7e6e9841c/wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6", size = 115351, upload-time = "2026-03-06T02:53:32.684Z" },
+    { url = "https://files.pythonhosted.org/packages/97/6f/06e66189e721dbebd5cf20e138acc4d1150288ce118462f2fcbff92d38db/wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9", size = 111748, upload-time = "2026-03-06T02:53:08.455Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/43/4808b86f499a51370fbdbdfa6cb91e9b9169e762716456471b619fca7a70/wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15", size = 113783, upload-time = "2026-03-06T02:53:02.02Z" },
+    { url = "https://files.pythonhosted.org/packages/91/2c/a3f28b8fa7ac2cefa01cfcaca3471f9b0460608d012b693998cd61ef43df/wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b", size = 57977, upload-time = "2026-03-06T02:53:27.844Z" },
+    { url = "https://files.pythonhosted.org/packages/3f/c3/2b1c7bd07a27b1db885a2fab469b707bdd35bddf30a113b4917a7e2139d2/wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1", size = 60336, upload-time = "2026-03-06T02:54:28.104Z" },
+    { url = "https://files.pythonhosted.org/packages/ec/5c/76ece7b401b088daa6503d6264dd80f9a727df3e6042802de9a223084ea2/wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a", size = 58756, upload-time = "2026-03-06T02:53:16.319Z" },
     { url = "https://files.pythonhosted.org/packages/4c/b6/1db817582c49c7fcbb7df6809d0f515af29d7c2fbf57eb44c36e98fb1492/wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9", size = 61255, upload-time = "2026-03-06T02:52:45.663Z" },
     { url = "https://files.pythonhosted.org/packages/a2/16/9b02a6b99c09227c93cd4b73acc3678114154ec38da53043c0ddc1fba0dc/wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748", size = 61848, upload-time = "2026-03-06T02:53:48.728Z" },
     { url = "https://files.pythonhosted.org/packages/af/aa/ead46a88f9ec3a432a4832dfedb84092fc35af2d0ba40cd04aea3889f247/wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e", size = 121433, upload-time = "2026-03-06T02:54:40.328Z" },
@@ -7185,6 +8920,20 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/08/d5/25f7b19af3a2cb4000cac4f9e5525a40bec79f4f5d0ac9b517c0544586a0/xattr-1.3.0.tar.gz", hash = "sha256:30439fabd7de0787b27e9a6e1d569c5959854cb322f64ce7380fedbfa5035036", size = 17148, upload-time = "2025-10-13T22:16:47.353Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/ab/11/bbb25ab921e02efb789efcab5b7d03581b5d28f71d829f21e4ea6aba09fb/xattr-1.3.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a80c4617e08670cdc3ba71f1dbb275c1627744c5c3641280879cb3bc95a07237", size = 23453, upload-time = "2025-10-13T22:15:50.753Z" },
+    { url = "https://files.pythonhosted.org/packages/be/88/66021fdfbb2037a94fc5b16c1dce1894b8e9da7a1829e4be0b491b3f24ff/xattr-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:51cdaa359f5cd2861178ae01ea3647b56dbdfd98e724a8aa3c04f77123b78217", size = 18551, upload-time = "2025-10-13T22:15:51.961Z" },
+    { url = "https://files.pythonhosted.org/packages/be/f7/5dd21fcfc48487a59fcec33ffe02eb671f256424869e9aef87e33c65d95b/xattr-1.3.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2fea070768d7d2d25797817bea93bf0a6fda6449e88cfee8bb3d75de9ed11c7b", size = 18852, upload-time = "2025-10-13T22:15:53.104Z" },
+    { url = "https://files.pythonhosted.org/packages/af/2a/e29753ac17a92aadf27b9e16b1d600584d9f10acd0b399d2c06f47af2dff/xattr-1.3.0-cp310-cp310-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:69bca34be2d7a928389aff4e32f27857e1c62d04c91ec7c1519b1636870bd58f", size = 38547, upload-time = "2025-10-13T22:15:54.385Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/46/b2c9185d24b93542e4307ce30cd3d4eb6af8efdc843d98ff9f07fcb048d9/xattr-1.3.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:05f8e068409742d246babba60cff8310b2c577745491f498b08bf068e0c867a3", size = 38755, upload-time = "2025-10-13T22:15:55.738Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/0a/93cf1f03536bf38e8fd3fe57eb04124e4dfe2e16c0c5ced589d3360a1858/xattr-1.3.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:bbd06987102bc11f5cbd08b15d1029832b862cf5bc61780573fc0828812f01ca", size = 38052, upload-time = "2025-10-13T22:15:57.031Z" },
+    { url = "https://files.pythonhosted.org/packages/55/ad/60e43f7e1037cee671e14c2a283e3e7168b756c9938eba62f0616e6599aa/xattr-1.3.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b8589744116d2c37928b771c50383cb281675cd6dcfd740abfab6883e3d4af85", size = 37560, upload-time = "2025-10-13T22:15:58.295Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/64/292426ad5653e72c6e1325bbff22868a20077290d967cebb9c0624ad08b6/xattr-1.3.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:331a51bf8f20c27822f44054b0d760588462d3ed472d5e52ba135cf0bea510e8", size = 23448, upload-time = "2025-10-13T22:15:59.229Z" },
+    { url = "https://files.pythonhosted.org/packages/63/84/6539fbe620da8e5927406e76b9c8abad8953025d5f578d792747c38a8c0e/xattr-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:196360f068b74fa0132a8c6001ce1333f095364b8f43b6fd8cdaf2f18741ef89", size = 18553, upload-time = "2025-10-13T22:16:00.151Z" },
+    { url = "https://files.pythonhosted.org/packages/cc/bb/c1c2e24a49f8d13ff878fb85aabc42ea1b2f98ce08d8205b9661d517a9cc/xattr-1.3.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:405d2e4911d37f2b9400fa501acd920fe0c97fe2b2ec252cb23df4b59c000811", size = 18848, upload-time = "2025-10-13T22:16:01.046Z" },
+    { url = "https://files.pythonhosted.org/packages/02/c2/a60aad150322b217dfe33695d8d9f32bc01e8f300641b6ba4b73f4b3c03f/xattr-1.3.0-cp311-cp311-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:4ae3a66ae1effd40994f64defeeaa97da369406485e60bfb421f2d781be3b75d", size = 38547, upload-time = "2025-10-13T22:16:01.973Z" },
+    { url = "https://files.pythonhosted.org/packages/c6/58/2eca142bad4ea0a2be6b58d3122d0acce310c4e53fa7defd168202772178/xattr-1.3.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:69cd3bfe779f7ba87abe6473fdfa428460cf9e78aeb7e390cfd737b784edf1b5", size = 38753, upload-time = "2025-10-13T22:16:03.244Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/50/d032e5254c2c27d36bdb02abdf2735db6768a441f0e3d0f139e0f9f56638/xattr-1.3.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c5742ca61761a99ae0c522f90a39d5fb8139280f27b254e3128482296d1df2db", size = 38054, upload-time = "2025-10-13T22:16:04.656Z" },
+    { url = "https://files.pythonhosted.org/packages/04/24/458a306439aabe0083ca0a7b14c3e6a800ab9782b5ec0bdcec4ec9f3dc6c/xattr-1.3.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4a04ada131e9bdfd32db3ab1efa9f852646f4f7c9d6fde0596c3825c67161be3", size = 37562, upload-time = "2025-10-13T22:16:05.97Z" },
     { url = "https://files.pythonhosted.org/packages/bf/78/00bdc9290066173e53e1e734d8d8e1a84a6faa9c66aee9df81e4d9aeec1c/xattr-1.3.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:dd4e63614722d183e81842cb237fd1cc978d43384166f9fe22368bfcb187ebe5", size = 23476, upload-time = "2025-10-13T22:16:06.942Z" },
     { url = "https://files.pythonhosted.org/packages/53/16/5243722294eb982514fa7b6b87a29dfb7b29b8e5e1486500c5babaf6e4b3/xattr-1.3.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:995843ef374af73e3370b0c107319611f3cdcdb6d151d629449efecad36be4c4", size = 18556, upload-time = "2025-10-13T22:16:08.209Z" },
     { url = "https://files.pythonhosted.org/packages/d6/5c/d7ab0e547bea885b55f097206459bd612cefb652c5fc1f747130cbc0d42c/xattr-1.3.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fa23a25220e29d956cedf75746e3df6cc824cc1553326d6516479967c540e386", size = 18869, upload-time = "2025-10-13T22:16:10.319Z" },
@@ -7221,6 +8970,36 @@ version = "3.6.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160, upload-time = "2025-10-02T14:37:08.097Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/34/ee/f9f1d656ad168681bb0f6b092372c1e533c4416b8069b1896a175c46e484/xxhash-3.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:87ff03d7e35c61435976554477a7f4cd1704c3596a89a8300d5ce7fc83874a71", size = 32845, upload-time = "2025-10-02T14:33:51.573Z" },
+    { url = "https://files.pythonhosted.org/packages/a3/b1/93508d9460b292c74a09b83d16750c52a0ead89c51eea9951cb97a60d959/xxhash-3.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f572dfd3d0e2eb1a57511831cf6341242f5a9f8298a45862d085f5b93394a27d", size = 30807, upload-time = "2025-10-02T14:33:52.964Z" },
+    { url = "https://files.pythonhosted.org/packages/07/55/28c93a3662f2d200c70704efe74aab9640e824f8ce330d8d3943bf7c9b3c/xxhash-3.6.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:89952ea539566b9fed2bbd94e589672794b4286f342254fad28b149f9615fef8", size = 193786, upload-time = "2025-10-02T14:33:54.272Z" },
+    { url = "https://files.pythonhosted.org/packages/c1/96/fec0be9bb4b8f5d9c57d76380a366f31a1781fb802f76fc7cda6c84893c7/xxhash-3.6.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48e6f2ffb07a50b52465a1032c3cf1f4a5683f944acaca8a134a2f23674c2058", size = 212830, upload-time = "2025-10-02T14:33:55.706Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/a0/c706845ba77b9611f81fd2e93fad9859346b026e8445e76f8c6fd057cc6d/xxhash-3.6.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b5b848ad6c16d308c3ac7ad4ba6bede80ed5df2ba8ed382f8932df63158dd4b2", size = 211606, upload-time = "2025-10-02T14:33:57.133Z" },
+    { url = "https://files.pythonhosted.org/packages/67/1e/164126a2999e5045f04a69257eea946c0dc3e86541b400d4385d646b53d7/xxhash-3.6.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:a034590a727b44dd8ac5914236a7b8504144447a9682586c3327e935f33ec8cc", size = 444872, upload-time = "2025-10-02T14:33:58.446Z" },
+    { url = "https://files.pythonhosted.org/packages/2d/4b/55ab404c56cd70a2cf5ecfe484838865d0fea5627365c6c8ca156bd09c8f/xxhash-3.6.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8a8f1972e75ebdd161d7896743122834fe87378160c20e97f8b09166213bf8cc", size = 193217, upload-time = "2025-10-02T14:33:59.724Z" },
+    { url = "https://files.pythonhosted.org/packages/45/e6/52abf06bac316db33aa269091ae7311bd53cfc6f4b120ae77bac1b348091/xxhash-3.6.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ee34327b187f002a596d7b167ebc59a1b729e963ce645964bbc050d2f1b73d07", size = 210139, upload-time = "2025-10-02T14:34:02.041Z" },
+    { url = "https://files.pythonhosted.org/packages/34/37/db94d490b8691236d356bc249c08819cbcef9273a1a30acf1254ff9ce157/xxhash-3.6.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:339f518c3c7a850dd033ab416ea25a692759dc7478a71131fe8869010d2b75e4", size = 197669, upload-time = "2025-10-02T14:34:03.664Z" },
+    { url = "https://files.pythonhosted.org/packages/b7/36/c4f219ef4a17a4f7a64ed3569bc2b5a9c8311abdb22249ac96093625b1a4/xxhash-3.6.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:bf48889c9630542d4709192578aebbd836177c9f7a4a2778a7d6340107c65f06", size = 210018, upload-time = "2025-10-02T14:34:05.325Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/06/bfac889a374fc2fc439a69223d1750eed2e18a7db8514737ab630534fa08/xxhash-3.6.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:5576b002a56207f640636056b4160a378fe36a58db73ae5c27a7ec8db35f71d4", size = 413058, upload-time = "2025-10-02T14:34:06.925Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/d1/555d8447e0dd32ad0930a249a522bb2e289f0d08b6b16204cfa42c1f5a0c/xxhash-3.6.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:af1f3278bd02814d6dedc5dec397993b549d6f16c19379721e5a1d31e132c49b", size = 190628, upload-time = "2025-10-02T14:34:08.669Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/15/8751330b5186cedc4ed4b597989882ea05e0408b53fa47bcb46a6125bfc6/xxhash-3.6.0-cp310-cp310-win32.whl", hash = "sha256:aed058764db109dc9052720da65fafe84873b05eb8b07e5e653597951af57c3b", size = 30577, upload-time = "2025-10-02T14:34:10.234Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/cc/53f87e8b5871a6eb2ff7e89c48c66093bda2be52315a8161ddc54ea550c4/xxhash-3.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:e82da5670f2d0d98950317f82a0e4a0197150ff19a6df2ba40399c2a3b9ae5fb", size = 31487, upload-time = "2025-10-02T14:34:11.618Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/00/60f9ea3bb697667a14314d7269956f58bf56bb73864f8f8d52a3c2535e9a/xxhash-3.6.0-cp310-cp310-win_arm64.whl", hash = "sha256:4a082ffff8c6ac07707fb6b671caf7c6e020c75226c561830b73d862060f281d", size = 27863, upload-time = "2025-10-02T14:34:12.619Z" },
+    { url = "https://files.pythonhosted.org/packages/17/d4/cc2f0400e9154df4b9964249da78ebd72f318e35ccc425e9f403c392f22a/xxhash-3.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b47bbd8cf2d72797f3c2772eaaac0ded3d3af26481a26d7d7d41dc2d3c46b04a", size = 32844, upload-time = "2025-10-02T14:34:14.037Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/ec/1cc11cd13e26ea8bc3cb4af4eaadd8d46d5014aebb67be3f71fb0b68802a/xxhash-3.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2b6821e94346f96db75abaa6e255706fb06ebd530899ed76d32cd99f20dc52fa", size = 30809, upload-time = "2025-10-02T14:34:15.484Z" },
+    { url = "https://files.pythonhosted.org/packages/04/5f/19fe357ea348d98ca22f456f75a30ac0916b51c753e1f8b2e0e6fb884cce/xxhash-3.6.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:d0a9751f71a1a65ce3584e9cae4467651c7e70c9d31017fa57574583a4540248", size = 194665, upload-time = "2025-10-02T14:34:16.541Z" },
+    { url = "https://files.pythonhosted.org/packages/90/3b/d1f1a8f5442a5fd8beedae110c5af7604dc37349a8e16519c13c19a9a2de/xxhash-3.6.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b29ee68625ab37b04c0b40c3fafdf24d2f75ccd778333cfb698f65f6c463f62", size = 213550, upload-time = "2025-10-02T14:34:17.878Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/ef/3a9b05eb527457d5db13a135a2ae1a26c80fecd624d20f3e8dcc4cb170f3/xxhash-3.6.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6812c25fe0d6c36a46ccb002f40f27ac903bf18af9f6dd8f9669cb4d176ab18f", size = 212384, upload-time = "2025-10-02T14:34:19.182Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/18/ccc194ee698c6c623acbf0f8c2969811a8a4b6185af5e824cd27b9e4fd3e/xxhash-3.6.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4ccbff013972390b51a18ef1255ef5ac125c92dc9143b2d1909f59abc765540e", size = 445749, upload-time = "2025-10-02T14:34:20.659Z" },
+    { url = "https://files.pythonhosted.org/packages/a5/86/cf2c0321dc3940a7aa73076f4fd677a0fb3e405cb297ead7d864fd90847e/xxhash-3.6.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:297b7fbf86c82c550e12e8fb71968b3f033d27b874276ba3624ea868c11165a8", size = 193880, upload-time = "2025-10-02T14:34:22.431Z" },
+    { url = "https://files.pythonhosted.org/packages/82/fb/96213c8560e6f948a1ecc9a7613f8032b19ee45f747f4fca4eb31bb6d6ed/xxhash-3.6.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dea26ae1eb293db089798d3973a5fc928a18fdd97cc8801226fae705b02b14b0", size = 210912, upload-time = "2025-10-02T14:34:23.937Z" },
+    { url = "https://files.pythonhosted.org/packages/40/aa/4395e669b0606a096d6788f40dbdf2b819d6773aa290c19e6e83cbfc312f/xxhash-3.6.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7a0b169aafb98f4284f73635a8e93f0735f9cbde17bd5ec332480484241aaa77", size = 198654, upload-time = "2025-10-02T14:34:25.644Z" },
+    { url = "https://files.pythonhosted.org/packages/67/74/b044fcd6b3d89e9b1b665924d85d3f400636c23590226feb1eb09e1176ce/xxhash-3.6.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:08d45aef063a4531b785cd72de4887766d01dc8f362a515693df349fdb825e0c", size = 210867, upload-time = "2025-10-02T14:34:27.203Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/fd/3ce73bf753b08cb19daee1eb14aa0d7fe331f8da9c02dd95316ddfe5275e/xxhash-3.6.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:929142361a48ee07f09121fe9e96a84950e8d4df3bb298ca5d88061969f34d7b", size = 414012, upload-time = "2025-10-02T14:34:28.409Z" },
+    { url = "https://files.pythonhosted.org/packages/ba/b3/5a4241309217c5c876f156b10778f3ab3af7ba7e3259e6d5f5c7d0129eb2/xxhash-3.6.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:51312c768403d8540487dbbfb557454cfc55589bbde6424456951f7fcd4facb3", size = 191409, upload-time = "2025-10-02T14:34:29.696Z" },
+    { url = "https://files.pythonhosted.org/packages/c0/01/99bfbc15fb9abb9a72b088c1d95219fc4782b7d01fc835bd5744d66dd0b8/xxhash-3.6.0-cp311-cp311-win32.whl", hash = "sha256:d1927a69feddc24c987b337ce81ac15c4720955b667fe9b588e02254b80446fd", size = 30574, upload-time = "2025-10-02T14:34:31.028Z" },
+    { url = "https://files.pythonhosted.org/packages/65/79/9d24d7f53819fe301b231044ea362ce64e86c74f6e8c8e51320de248b3e5/xxhash-3.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:26734cdc2d4ffe449b41d186bbeac416f704a482ed835d375a5c0cb02bc63fef", size = 31481, upload-time = "2025-10-02T14:34:32.062Z" },
+    { url = "https://files.pythonhosted.org/packages/30/4e/15cd0e3e8772071344eab2961ce83f6e485111fed8beb491a3f1ce100270/xxhash-3.6.0-cp311-cp311-win_arm64.whl", hash = "sha256:d72f67ef8bf36e05f5b6c65e8524f265bd61071471cd4cf1d36743ebeeeb06b7", size = 27861, upload-time = "2025-10-02T14:34:33.555Z" },
     { url = "https://files.pythonhosted.org/packages/9a/07/d9412f3d7d462347e4511181dea65e47e0d0e16e26fbee2ea86a2aefb657/xxhash-3.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:01362c4331775398e7bb34e3ab403bc9ee9f7c497bc7dee6272114055277dd3c", size = 32744, upload-time = "2025-10-02T14:34:34.622Z" },
     { url = "https://files.pythonhosted.org/packages/79/35/0429ee11d035fc33abe32dca1b2b69e8c18d236547b9a9b72c1929189b9a/xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7b2df81a23f8cb99656378e72501b2cb41b1827c0f5a86f87d6b06b69f9f204", size = 30816, upload-time = "2025-10-02T14:34:36.043Z" },
     { url = "https://files.pythonhosted.org/packages/b7/f2/57eb99aa0f7d98624c0932c5b9a170e1806406cdbcdb510546634a1359e0/xxhash-3.6.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dc94790144e66b14f67b10ac8ed75b39ca47536bf8800eb7c24b50271ea0c490", size = 194035, upload-time = "2025-10-02T14:34:37.354Z" },
@@ -7296,6 +9075,11 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/9a/9a/c19c42c5b3f5a4aad748a6d5b4f23df3bed7ee5445accc65a0fb3ff03953/xxhash-3.6.0-cp314-cp314t-win32.whl", hash = "sha256:5851f033c3030dd95c086b4a36a2683c2ff4a799b23af60977188b057e467119", size = 31586, upload-time = "2025-10-02T14:36:15.603Z" },
     { url = "https://files.pythonhosted.org/packages/03/d6/4cc450345be9924fd5dc8c590ceda1db5b43a0a889587b0ae81a95511360/xxhash-3.6.0-cp314-cp314t-win_amd64.whl", hash = "sha256:0444e7967dac37569052d2409b00a8860c2135cff05502df4da80267d384849f", size = 32526, upload-time = "2025-10-02T14:36:16.708Z" },
     { url = "https://files.pythonhosted.org/packages/0f/c9/7243eb3f9eaabd1a88a5a5acadf06df2d83b100c62684b7425c6a11bcaa8/xxhash-3.6.0-cp314-cp314t-win_arm64.whl", hash = "sha256:bb79b1e63f6fd84ec778a4b1916dfe0a7c3fdb986c06addd5db3a0d413819d95", size = 28898, upload-time = "2025-10-02T14:36:17.843Z" },
+    { url = "https://files.pythonhosted.org/packages/93/1e/8aec23647a34a249f62e2398c42955acd9b4c6ed5cf08cbea94dc46f78d2/xxhash-3.6.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:0f7b7e2ec26c1666ad5fc9dbfa426a6a3367ceaf79db5dd76264659d509d73b0", size = 30662, upload-time = "2025-10-02T14:37:01.743Z" },
+    { url = "https://files.pythonhosted.org/packages/b8/0b/b14510b38ba91caf43006209db846a696ceea6a847a0c9ba0a5b1adc53d6/xxhash-3.6.0-pp311-pypy311_pp73-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:5dc1e14d14fa0f5789ec29a7062004b5933964bb9b02aae6622b8f530dc40296", size = 41056, upload-time = "2025-10-02T14:37:02.879Z" },
+    { url = "https://files.pythonhosted.org/packages/50/55/15a7b8a56590e66ccd374bbfa3f9ffc45b810886c8c3b614e3f90bd2367c/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:881b47fc47e051b37d94d13e7455131054b56749b91b508b0907eb07900d1c13", size = 36251, upload-time = "2025-10-02T14:37:04.44Z" },
+    { url = "https://files.pythonhosted.org/packages/62/b2/5ac99a041a29e58e95f907876b04f7067a0242cb85b5f39e726153981503/xxhash-3.6.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c6dc31591899f5e5666f04cc2e529e69b4072827085c1ef15294d91a004bc1bd", size = 32481, upload-time = "2025-10-02T14:37:05.869Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/d9/8d95e906764a386a3d3b596f3c68bb63687dfca806373509f51ce8eea81f/xxhash-3.6.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:15e0dac10eb9309508bfc41f7f9deaa7755c69e35af835db9cb10751adebc35d", size = 31565, upload-time = "2025-10-02T14:37:06.966Z" },
 ]
 
 [[package]]
@@ -7309,6 +9093,42 @@ dependencies = [
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/23/6e/beb1beec874a72f23815c1434518bfc4ed2175065173fb138c3705f658d4/yarl-1.23.0.tar.gz", hash = "sha256:53b1ea6ca88ebd4420379c330aea57e258408dd0df9af0992e5de2078dc9f5d5", size = 194676, upload-time = "2026-03-01T22:07:53.373Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/8b/0d/9cc638702f6fc3c7a3685bcc8cf2a9ed7d6206e932a49f5242658047ef51/yarl-1.23.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:cff6d44cb13d39db2663a22b22305d10855efa0fa8015ddeacc40bc59b9d8107", size = 123764, upload-time = "2026-03-01T22:04:09.7Z" },
+    { url = "https://files.pythonhosted.org/packages/7a/35/5a553687c5793df5429cd1db45909d4f3af7eee90014888c208d086a44f0/yarl-1.23.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c53f8347cd4200f0d70a48ad059cabaf24f5adc6ba08622a23423bc7efa10d", size = 86282, upload-time = "2026-03-01T22:04:11.892Z" },
+    { url = "https://files.pythonhosted.org/packages/68/2e/c5a2234238f8ce37a8312b52801ee74117f576b1539eec8404a480434acc/yarl-1.23.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2a6940a074fb3c48356ed0158a3ca5699c955ee4185b4d7d619be3c327143e05", size = 86053, upload-time = "2026-03-01T22:04:13.292Z" },
+    { url = "https://files.pythonhosted.org/packages/74/3f/bbd8ff36fb038622797ffbaf7db314918bb4d76f1cc8a4f9ca7a55fe5195/yarl-1.23.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ed5f69ce7be7902e5c70ea19eb72d20abf7d725ab5d49777d696e32d4fc1811d", size = 99395, upload-time = "2026-03-01T22:04:15.133Z" },
+    { url = "https://files.pythonhosted.org/packages/77/04/9516bc4e269d2a3ec9c6779fcdeac51ce5b3a9b0156f06ac7152e5bba864/yarl-1.23.0-cp310-cp310-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:389871e65468400d6283c0308e791a640b5ab5c83bcee02a2f51295f95e09748", size = 92143, upload-time = "2026-03-01T22:04:16.829Z" },
+    { url = "https://files.pythonhosted.org/packages/c7/63/88802d1f6b1cb1fc67d67a58cd0cf8a1790de4ce7946e434240f1d60ab4a/yarl-1.23.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:dda608c88cf709b1d406bdfcd84d8d63cff7c9e577a403c6108ce8ce9dcc8764", size = 107643, upload-time = "2026-03-01T22:04:18.519Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/db/4f9b838f4d8bdd6f0f385aed8bbf21c71ed11a0b9983305c302cbd557815/yarl-1.23.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8c4fe09e0780c6c3bf2b7d4af02ee2394439d11a523bbcf095cf4747c2932007", size = 108700, upload-time = "2026-03-01T22:04:20.373Z" },
+    { url = "https://files.pythonhosted.org/packages/50/12/95a1d33f04a79c402664070d43b8b9f72dc18914e135b345b611b0b1f8cc/yarl-1.23.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:31c9921eb8bd12633b41ad27686bbb0b1a2a9b8452bfdf221e34f311e9942ed4", size = 102769, upload-time = "2026-03-01T22:04:23.055Z" },
+    { url = "https://files.pythonhosted.org/packages/86/65/91a0285f51321369fd1a8308aa19207520c5f0587772cfc2e03fc2467e90/yarl-1.23.0-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:5f10fd85e4b75967468af655228fbfd212bdf66db1c0d135065ce288982eda26", size = 101114, upload-time = "2026-03-01T22:04:25.031Z" },
+    { url = "https://files.pythonhosted.org/packages/58/80/c7c8244fc3e5bc483dc71a09560f43b619fab29301a0f0a8f936e42865c7/yarl-1.23.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dbf507e9ef5688bada447a24d68b4b58dd389ba93b7afc065a2ba892bea54769", size = 98883, upload-time = "2026-03-01T22:04:27.281Z" },
+    { url = "https://files.pythonhosted.org/packages/86/e7/71ca9cc9ca79c0b7d491216177d1aed559d632947b8ffb0ee60f7d8b23e3/yarl-1.23.0-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:85e9beda1f591bc73e77ea1c51965c68e98dafd0fec72cdd745f77d727466716", size = 94172, upload-time = "2026-03-01T22:04:28.554Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/3f/6c6c8a0fe29c26fb2db2e8d32195bb84ec1bfb8f1d32e7f73b787fcf349b/yarl-1.23.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:0e1fdaa14ef51366d7757b45bde294e95f6c8c049194e793eedb8387c86d5993", size = 107010, upload-time = "2026-03-01T22:04:30.385Z" },
+    { url = "https://files.pythonhosted.org/packages/56/38/12730c05e5ad40a76374d440ed8b0899729a96c250516d91c620a6e38fc2/yarl-1.23.0-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:75e3026ab649bf48f9a10c0134512638725b521340293f202a69b567518d94e0", size = 100285, upload-time = "2026-03-01T22:04:31.752Z" },
+    { url = "https://files.pythonhosted.org/packages/34/92/6a7be9239f2347234e027284e7a5f74b1140cc86575e7b469d13fba1ebfe/yarl-1.23.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:80e6d33a3d42a7549b409f199857b4fb54e2103fc44fb87605b6663b7a7ff750", size = 108230, upload-time = "2026-03-01T22:04:33.844Z" },
+    { url = "https://files.pythonhosted.org/packages/5e/81/4aebccfa9376bd98b9d8bfad20621a57d3e8cfc5b8631c1fa5f62cdd03f4/yarl-1.23.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5ec2f42d41ccbd5df0270d7df31618a8ee267bfa50997f5d720ddba86c4a83a6", size = 103008, upload-time = "2026-03-01T22:04:35.856Z" },
+    { url = "https://files.pythonhosted.org/packages/38/0f/0b4e3edcec794a86b853b0c6396c0a888d72dfce19b2d88c02ac289fb6c1/yarl-1.23.0-cp310-cp310-win32.whl", hash = "sha256:debe9c4f41c32990771be5c22b56f810659f9ddf3d63f67abfdcaa2c6c9c5c1d", size = 83073, upload-time = "2026-03-01T22:04:38.268Z" },
+    { url = "https://files.pythonhosted.org/packages/a0/71/ad95c33da18897e4c636528bbc24a1dd23fe16797de8bc4ec667b8db0ba4/yarl-1.23.0-cp310-cp310-win_amd64.whl", hash = "sha256:ab5f043cb8a2d71c981c09c510da013bc79fd661f5c60139f00dd3c3cc4f2ffb", size = 87328, upload-time = "2026-03-01T22:04:39.558Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/14/dfa369523c79bccf9c9c746b0a63eb31f65db9418ac01275f7950962e504/yarl-1.23.0-cp310-cp310-win_arm64.whl", hash = "sha256:263cd4f47159c09b8b685890af949195b51d1aa82ba451c5847ca9bc6413c220", size = 82463, upload-time = "2026-03-01T22:04:41.454Z" },
+    { url = "https://files.pythonhosted.org/packages/a2/aa/60da938b8f0997ba3a911263c40d82b6f645a67902a490b46f3355e10fae/yarl-1.23.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:b35d13d549077713e4414f927cdc388d62e543987c572baee613bf82f11a4b99", size = 123641, upload-time = "2026-03-01T22:04:42.841Z" },
+    { url = "https://files.pythonhosted.org/packages/24/84/e237607faf4e099dbb8a4f511cfd5efcb5f75918baad200ff7380635631b/yarl-1.23.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cbb0fef01f0c6b38cb0f39b1f78fc90b807e0e3c86a7ff3ce74ad77ce5c7880c", size = 86248, upload-time = "2026-03-01T22:04:44.757Z" },
+    { url = "https://files.pythonhosted.org/packages/b2/0d/71ceabc14c146ba8ee3804ca7b3d42b1664c8440439de5214d366fec7d3a/yarl-1.23.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dc52310451fc7c629e13c4e061cbe2dd01684d91f2f8ee2821b083c58bd72432", size = 85988, upload-time = "2026-03-01T22:04:46.365Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/6c/4a90d59c572e46b270ca132aca66954f1175abd691f74c1ef4c6711828e2/yarl-1.23.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b2c6b50c7b0464165472b56b42d4c76a7b864597007d9c085e8b63e185cf4a7a", size = 100566, upload-time = "2026-03-01T22:04:47.639Z" },
+    { url = "https://files.pythonhosted.org/packages/49/fb/c438fb5108047e629f6282a371e6e91cf3f97ee087c4fb748a1f32ceef55/yarl-1.23.0-cp311-cp311-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:aafe5dcfda86c8af00386d7781d4c2181b5011b7be3f2add5e99899ea925df05", size = 92079, upload-time = "2026-03-01T22:04:48.925Z" },
+    { url = "https://files.pythonhosted.org/packages/d9/13/d269aa1aed3e4f50a5a103f96327210cc5fa5dd2d50882778f13c7a14606/yarl-1.23.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9ee33b875f0b390564c1fb7bc528abf18c8ee6073b201c6ae8524aca778e2d83", size = 108741, upload-time = "2026-03-01T22:04:50.838Z" },
+    { url = "https://files.pythonhosted.org/packages/85/fb/115b16f22c37ea4437d323e472945bea97301c8ec6089868fa560abab590/yarl-1.23.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:4c41e021bc6d7affb3364dc1e1e5fa9582b470f283748784bd6ea0558f87f42c", size = 108099, upload-time = "2026-03-01T22:04:52.499Z" },
+    { url = "https://files.pythonhosted.org/packages/9a/64/c53487d9f4968045b8afa51aed7ca44f58b2589e772f32745f3744476c82/yarl-1.23.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:99c8a9ed30f4164bc4c14b37a90208836cbf50d4ce2a57c71d0f52c7fb4f7598", size = 102678, upload-time = "2026-03-01T22:04:55.176Z" },
+    { url = "https://files.pythonhosted.org/packages/85/59/cd98e556fbb2bf8fab29c1a722f67ad45c5f3447cac798ab85620d1e70af/yarl-1.23.0-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f2af5c81a1f124609d5f33507082fc3f739959d4719b56877ab1ee7e7b3d602b", size = 100803, upload-time = "2026-03-01T22:04:56.588Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/c0/b39770b56d4a9f0bb5f77e2f1763cd2d75cc2f6c0131e3b4c360348fcd65/yarl-1.23.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6b41389c19b07c760c7e427a3462e8ab83c4bb087d127f0e854c706ce1b9215c", size = 100163, upload-time = "2026-03-01T22:04:58.492Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/64/6980f99ab00e1f0ff67cb84766c93d595b067eed07439cfccfc8fb28c1a6/yarl-1.23.0-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:1dc702e42d0684f42d6519c8d581e49c96cefaaab16691f03566d30658ee8788", size = 93859, upload-time = "2026-03-01T22:05:00.268Z" },
+    { url = "https://files.pythonhosted.org/packages/38/69/912e6c5e146793e5d4b5fe39ff5b00f4d22463dfd5a162bec565ac757673/yarl-1.23.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:0e40111274f340d32ebcc0a5668d54d2b552a6cca84c9475859d364b380e3222", size = 108202, upload-time = "2026-03-01T22:05:02.273Z" },
+    { url = "https://files.pythonhosted.org/packages/59/97/35ca6767524687ad64e5f5c31ad54bc76d585585a9fcb40f649e7e82ffed/yarl-1.23.0-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:4764a6a7588561a9aef92f65bda2c4fb58fe7c675c0883862e6df97559de0bfb", size = 99866, upload-time = "2026-03-01T22:05:03.597Z" },
+    { url = "https://files.pythonhosted.org/packages/d3/1c/1a3387ee6d73589f6f2a220ae06f2984f6c20b40c734989b0a44f5987308/yarl-1.23.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:03214408cfa590df47728b84c679ae4ef00be2428e11630277be0727eba2d7cc", size = 107852, upload-time = "2026-03-01T22:05:04.986Z" },
+    { url = "https://files.pythonhosted.org/packages/a4/b8/35c0750fcd5a3f781058bfd954515dd4b1eab45e218cbb85cf11132215f1/yarl-1.23.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:170e26584b060879e29fac213e4228ef063f39128723807a312e5c7fec28eff2", size = 102919, upload-time = "2026-03-01T22:05:06.397Z" },
+    { url = "https://files.pythonhosted.org/packages/e5/1c/9a1979aec4a81896d597bcb2177827f2dbee3f5b7cc48b2d0dadb644b41d/yarl-1.23.0-cp311-cp311-win32.whl", hash = "sha256:51430653db848d258336cfa0244427b17d12db63d42603a55f0d4546f50f25b5", size = 82602, upload-time = "2026-03-01T22:05:08.444Z" },
+    { url = "https://files.pythonhosted.org/packages/93/22/b85eca6fa2ad9491af48c973e4c8cf6b103a73dbb271fe3346949449fca0/yarl-1.23.0-cp311-cp311-win_amd64.whl", hash = "sha256:bf49a3ae946a87083ef3a34c8f677ae4243f5b824bfc4c69672e72b3d6719d46", size = 87461, upload-time = "2026-03-01T22:05:10.145Z" },
+    { url = "https://files.pythonhosted.org/packages/93/95/07e3553fe6f113e6864a20bdc53a78113cda3b9ced8784ee52a52c9f80d8/yarl-1.23.0-cp311-cp311-win_arm64.whl", hash = "sha256:b39cb32a6582750b6cc77bfb3c49c0f8760dc18dc96ec9fb55fbb0f04e08b928", size = 82336, upload-time = "2026-03-01T22:05:11.554Z" },
     { url = "https://files.pythonhosted.org/packages/88/8a/94615bc31022f711add374097ad4144d569e95ff3c38d39215d07ac153a0/yarl-1.23.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:1932b6b8bba8d0160a9d1078aae5838a66039e8832d41d2992daa9a3a08f7860", size = 124737, upload-time = "2026-03-01T22:05:12.897Z" },
     { url = "https://files.pythonhosted.org/packages/e3/6f/c6554045d59d64052698add01226bc867b52fe4a12373415d7991fdca95d/yarl-1.23.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:411225bae281f114067578891bc75534cfb3d92a3b4dfef7a6ca78ba354e6069", size = 87029, upload-time = "2026-03-01T22:05:14.376Z" },
     { url = "https://files.pythonhosted.org/packages/19/2a/725ecc166d53438bc88f76822ed4b1e3b10756e790bafd7b523fe97c322d/yarl-1.23.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13a563739ae600a631c36ce096615fe307f131344588b0bc0daec108cdb47b25", size = 86310, upload-time = "2026-03-01T22:05:15.71Z" },
@@ -7404,11 +9224,11 @@ wheels = [
 
 [[package]]
 name = "zipp"
-version = "3.23.1"
+version = "3.23.0"
 source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/30/21/093488dfc7cc8964ded15ab726fad40f25fd3d788fd741cc1c5a17d78ee8/zipp-3.23.1.tar.gz", hash = "sha256:32120e378d32cd9714ad503c1d024619063ec28aad2248dc6672ad13edfa5110", size = 25965, upload-time = "2026-04-13T23:21:46.6Z" }
+sdist = { url = "https://files.pythonhosted.org/packages/e3/02/0f2892c661036d50ede074e376733dca2ae7c6eb617489437771209d4180/zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166", size = 25547, upload-time = "2025-06-08T17:06:39.4Z" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/08/8a/0861bec20485572fbddf3dfba2910e38fe249796cb73ecdeb74e07eeb8d3/zipp-3.23.1-py3-none-any.whl", hash = "sha256:0b3596c50a5c700c9cb40ba8d86d9f2cc4807e9bedb06bcdf7fac85633e444dc", size = 10378, upload-time = "2026-04-13T23:21:45.386Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/54/647ade08bf0db230bfea292f893923872fd20be6ac6f53b2b936ba839d75/zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e", size = 10276, upload-time = "2025-06-08T17:06:38.034Z" },
 ]
 
 [[package]]
@@ -7417,6 +9237,39 @@ version = "0.25.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" }
 wheels = [
+    { url = "https://files.pythonhosted.org/packages/56/7a/28efd1d371f1acd037ac64ed1c5e2b41514a6cc937dd6ab6a13ab9f0702f/zstandard-0.25.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e59fdc271772f6686e01e1b3b74537259800f57e24280be3f29c8a0deb1904dd", size = 795256, upload-time = "2025-09-14T22:15:56.415Z" },
+    { url = "https://files.pythonhosted.org/packages/96/34/ef34ef77f1ee38fc8e4f9775217a613b452916e633c4f1d98f31db52c4a5/zstandard-0.25.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4d441506e9b372386a5271c64125f72d5df6d2a8e8a2a45a0ae09b03cb781ef7", size = 640565, upload-time = "2025-09-14T22:15:58.177Z" },
+    { url = "https://files.pythonhosted.org/packages/9d/1b/4fdb2c12eb58f31f28c4d28e8dc36611dd7205df8452e63f52fb6261d13e/zstandard-0.25.0-cp310-cp310-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:ab85470ab54c2cb96e176f40342d9ed41e58ca5733be6a893b730e7af9c40550", size = 5345306, upload-time = "2025-09-14T22:16:00.165Z" },
+    { url = "https://files.pythonhosted.org/packages/73/28/a44bdece01bca027b079f0e00be3b6bd89a4df180071da59a3dd7381665b/zstandard-0.25.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e05ab82ea7753354bb054b92e2f288afb750e6b439ff6ca78af52939ebbc476d", size = 5055561, upload-time = "2025-09-14T22:16:02.22Z" },
+    { url = "https://files.pythonhosted.org/packages/e9/74/68341185a4f32b274e0fc3410d5ad0750497e1acc20bd0f5b5f64ce17785/zstandard-0.25.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:78228d8a6a1c177a96b94f7e2e8d012c55f9c760761980da16ae7546a15a8e9b", size = 5402214, upload-time = "2025-09-14T22:16:04.109Z" },
+    { url = "https://files.pythonhosted.org/packages/8b/67/f92e64e748fd6aaffe01e2b75a083c0c4fd27abe1c8747fee4555fcee7dd/zstandard-0.25.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:2b6bd67528ee8b5c5f10255735abc21aa106931f0dbaf297c7be0c886353c3d0", size = 5449703, upload-time = "2025-09-14T22:16:06.312Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/e5/6d36f92a197c3c17729a2125e29c169f460538a7d939a27eaaa6dcfcba8e/zstandard-0.25.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4b6d83057e713ff235a12e73916b6d356e3084fd3d14ced499d84240f3eecee0", size = 5556583, upload-time = "2025-09-14T22:16:08.457Z" },
+    { url = "https://files.pythonhosted.org/packages/d7/83/41939e60d8d7ebfe2b747be022d0806953799140a702b90ffe214d557638/zstandard-0.25.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9174f4ed06f790a6869b41cba05b43eeb9a35f8993c4422ab853b705e8112bbd", size = 5045332, upload-time = "2025-09-14T22:16:10.444Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/87/d3ee185e3d1aa0133399893697ae91f221fda79deb61adbe998a7235c43f/zstandard-0.25.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:25f8f3cd45087d089aef5ba3848cd9efe3ad41163d3400862fb42f81a3a46701", size = 5572283, upload-time = "2025-09-14T22:16:12.128Z" },
+    { url = "https://files.pythonhosted.org/packages/0a/1d/58635ae6104df96671076ac7d4ae7816838ce7debd94aecf83e30b7121b0/zstandard-0.25.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3756b3e9da9b83da1796f8809dd57cb024f838b9eeafde28f3cb472012797ac1", size = 4959754, upload-time = "2025-09-14T22:16:14.225Z" },
+    { url = "https://files.pythonhosted.org/packages/75/d6/57e9cb0a9983e9a229dd8fd2e6e96593ef2aa82a3907188436f22b111ccd/zstandard-0.25.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:81dad8d145d8fd981b2962b686b2241d3a1ea07733e76a2f15435dfb7fb60150", size = 5266477, upload-time = "2025-09-14T22:16:16.343Z" },
+    { url = "https://files.pythonhosted.org/packages/d1/a9/ee891e5edf33a6ebce0a028726f0bbd8567effe20fe3d5808c42323e8542/zstandard-0.25.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a5a419712cf88862a45a23def0ae063686db3d324cec7edbe40509d1a79a0aab", size = 5440914, upload-time = "2025-09-14T22:16:18.453Z" },
+    { url = "https://files.pythonhosted.org/packages/58/08/a8522c28c08031a9521f27abc6f78dbdee7312a7463dd2cfc658b813323b/zstandard-0.25.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e7360eae90809efd19b886e59a09dad07da4ca9ba096752e61a2e03c8aca188e", size = 5819847, upload-time = "2025-09-14T22:16:20.559Z" },
+    { url = "https://files.pythonhosted.org/packages/6f/11/4c91411805c3f7b6f31c60e78ce347ca48f6f16d552fc659af6ec3b73202/zstandard-0.25.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:75ffc32a569fb049499e63ce68c743155477610532da1eb38e7f24bf7cd29e74", size = 5363131, upload-time = "2025-09-14T22:16:22.206Z" },
+    { url = "https://files.pythonhosted.org/packages/ef/d6/8c4bd38a3b24c4c7676a7a3d8de85d6ee7a983602a734b9f9cdefb04a5d6/zstandard-0.25.0-cp310-cp310-win32.whl", hash = "sha256:106281ae350e494f4ac8a80470e66d1fe27e497052c8d9c3b95dc4cf1ade81aa", size = 436469, upload-time = "2025-09-14T22:16:25.002Z" },
+    { url = "https://files.pythonhosted.org/packages/93/90/96d50ad417a8ace5f841b3228e93d1bb13e6ad356737f42e2dde30d8bd68/zstandard-0.25.0-cp310-cp310-win_amd64.whl", hash = "sha256:ea9d54cc3d8064260114a0bbf3479fc4a98b21dffc89b3459edd506b69262f6e", size = 506100, upload-time = "2025-09-14T22:16:23.569Z" },
+    { url = "https://files.pythonhosted.org/packages/2a/83/c3ca27c363d104980f1c9cee1101cc8ba724ac8c28a033ede6aab89585b1/zstandard-0.25.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:933b65d7680ea337180733cf9e87293cc5500cc0eb3fc8769f4d3c88d724ec5c", size = 795254, upload-time = "2025-09-14T22:16:26.137Z" },
+    { url = "https://files.pythonhosted.org/packages/ac/4d/e66465c5411a7cf4866aeadc7d108081d8ceba9bc7abe6b14aa21c671ec3/zstandard-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3f79487c687b1fc69f19e487cd949bf3aae653d181dfb5fde3bf6d18894706f", size = 640559, upload-time = "2025-09-14T22:16:27.973Z" },
+    { url = "https://files.pythonhosted.org/packages/12/56/354fe655905f290d3b147b33fe946b0f27e791e4b50a5f004c802cb3eb7b/zstandard-0.25.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:0bbc9a0c65ce0eea3c34a691e3c4b6889f5f3909ba4822ab385fab9057099431", size = 5348020, upload-time = "2025-09-14T22:16:29.523Z" },
+    { url = "https://files.pythonhosted.org/packages/3b/13/2b7ed68bd85e69a2069bcc72141d378f22cae5a0f3b353a2c8f50ef30c1b/zstandard-0.25.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01582723b3ccd6939ab7b3a78622c573799d5d8737b534b86d0e06ac18dbde4a", size = 5058126, upload-time = "2025-09-14T22:16:31.811Z" },
+    { url = "https://files.pythonhosted.org/packages/c9/dd/fdaf0674f4b10d92cb120ccff58bbb6626bf8368f00ebfd2a41ba4a0dc99/zstandard-0.25.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5f1ad7bf88535edcf30038f6919abe087f606f62c00a87d7e33e7fc57cb69fcc", size = 5405390, upload-time = "2025-09-14T22:16:33.486Z" },
+    { url = "https://files.pythonhosted.org/packages/0f/67/354d1555575bc2490435f90d67ca4dd65238ff2f119f30f72d5cde09c2ad/zstandard-0.25.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:06acb75eebeedb77b69048031282737717a63e71e4ae3f77cc0c3b9508320df6", size = 5452914, upload-time = "2025-09-14T22:16:35.277Z" },
+    { url = "https://files.pythonhosted.org/packages/bb/1f/e9cfd801a3f9190bf3e759c422bbfd2247db9d7f3d54a56ecde70137791a/zstandard-0.25.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9300d02ea7c6506f00e627e287e0492a5eb0371ec1670ae852fefffa6164b072", size = 5559635, upload-time = "2025-09-14T22:16:37.141Z" },
+    { url = "https://files.pythonhosted.org/packages/21/88/5ba550f797ca953a52d708c8e4f380959e7e3280af029e38fbf47b55916e/zstandard-0.25.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfd06b1c5584b657a2892a6014c2f4c20e0db0208c159148fa78c65f7e0b0277", size = 5048277, upload-time = "2025-09-14T22:16:38.807Z" },
+    { url = "https://files.pythonhosted.org/packages/46/c0/ca3e533b4fa03112facbe7fbe7779cb1ebec215688e5df576fe5429172e0/zstandard-0.25.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f373da2c1757bb7f1acaf09369cdc1d51d84131e50d5fa9863982fd626466313", size = 5574377, upload-time = "2025-09-14T22:16:40.523Z" },
+    { url = "https://files.pythonhosted.org/packages/12/9b/3fb626390113f272abd0799fd677ea33d5fc3ec185e62e6be534493c4b60/zstandard-0.25.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c0e5a65158a7946e7a7affa6418878ef97ab66636f13353b8502d7ea03c8097", size = 4961493, upload-time = "2025-09-14T22:16:43.3Z" },
+    { url = "https://files.pythonhosted.org/packages/cb/d3/23094a6b6a4b1343b27ae68249daa17ae0651fcfec9ed4de09d14b940285/zstandard-0.25.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c8e167d5adf59476fa3e37bee730890e389410c354771a62e3c076c86f9f7778", size = 5269018, upload-time = "2025-09-14T22:16:45.292Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/a7/bb5a0c1c0f3f4b5e9d5b55198e39de91e04ba7c205cc46fcb0f95f0383c1/zstandard-0.25.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:98750a309eb2f020da61e727de7d7ba3c57c97cf6213f6f6277bb7fb42a8e065", size = 5443672, upload-time = "2025-09-14T22:16:47.076Z" },
+    { url = "https://files.pythonhosted.org/packages/27/22/503347aa08d073993f25109c36c8d9f029c7d5949198050962cb568dfa5e/zstandard-0.25.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:22a086cff1b6ceca18a8dd6096ec631e430e93a8e70a9ca5efa7561a00f826fa", size = 5822753, upload-time = "2025-09-14T22:16:49.316Z" },
+    { url = "https://files.pythonhosted.org/packages/e2/be/94267dc6ee64f0f8ba2b2ae7c7a2df934a816baaa7291db9e1aa77394c3c/zstandard-0.25.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72d35d7aa0bba323965da807a462b0966c91608ef3a48ba761678cb20ce5d8b7", size = 5366047, upload-time = "2025-09-14T22:16:51.328Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/a3/732893eab0a3a7aecff8b99052fecf9f605cf0fb5fb6d0290e36beee47a4/zstandard-0.25.0-cp311-cp311-win32.whl", hash = "sha256:f5aeea11ded7320a84dcdd62a3d95b5186834224a9e55b92ccae35d21a8b63d4", size = 436484, upload-time = "2025-09-14T22:16:55.005Z" },
+    { url = "https://files.pythonhosted.org/packages/43/a3/c6155f5c1cce691cb80dfd38627046e50af3ee9ddc5d0b45b9b063bfb8c9/zstandard-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:daab68faadb847063d0c56f361a289c4f268706b598afbf9ad113cbe5c38b6b2", size = 506183, upload-time = "2025-09-14T22:16:52.753Z" },
+    { url = "https://files.pythonhosted.org/packages/8c/3e/8945ab86a0820cc0e0cdbf38086a92868a9172020fdab8a03ac19662b0e5/zstandard-0.25.0-cp311-cp311-win_arm64.whl", hash = "sha256:22a06c5df3751bb7dc67406f5374734ccee8ed37fc5981bf1ad7041831fa1137", size = 462533, upload-time = "2025-09-14T22:16:53.878Z" },
     { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738, upload-time = "2025-09-14T22:16:56.237Z" },
     { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436, upload-time = "2025-09-14T22:16:57.774Z" },
     { url = "https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019, upload-time = "2025-09-14T22:16:59.302Z" },