-
Notifications
You must be signed in to change notification settings - Fork 403
Megatron-Bridge import example in launcher for Nemotron Super V3 #1516
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,3 +1,6 @@ | ||
| [submodule "tools/launcher/modules/Megatron-LM"] | ||
| path = tools/launcher/modules/Megatron-LM | ||
| url = https://github.com/NVIDIA/Megatron-LM.git | ||
| [submodule "tools/launcher/modules/Megatron-Bridge"] | ||
| path = tools/launcher/modules/Megatron-Bridge | ||
| url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,74 @@ | ||||||||||||||||||
| #!/bin/bash | ||||||||||||||||||
|
|
||||||||||||||||||
| # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||||||||||||||||||
| # SPDX-License-Identifier: Apache-2.0 | ||||||||||||||||||
| # | ||||||||||||||||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||||||||||||||||
| # you may not use this file except in compliance with the License. | ||||||||||||||||||
| # You may obtain a copy of the License at | ||||||||||||||||||
| # | ||||||||||||||||||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||||||||||||||||||
| # | ||||||||||||||||||
| # Unless required by applicable law or agreed to in writing, software | ||||||||||||||||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||||||||||||||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||||||||||||||||
| # See the License for the specific language governing permissions and | ||||||||||||||||||
| # limitations under the License. | ||||||||||||||||||
|
|
||||||||||||||||||
| # Megatron-Bridge HF -> Megatron checkpoint import (CPU-capable). | ||||||||||||||||||
| # | ||||||||||||||||||
| # Required env: HF_MODEL_ID (e.g. nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16) | ||||||||||||||||||
| # Optional env: | ||||||||||||||||||
| # OUTPUT_DIR Parent dir for the MCore checkpoint (default: cwd). | ||||||||||||||||||
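| # TORCH_DTYPE Model dtype for HF load (default: bfloat16). | ||||||||||||||||||
| # EXTRA_PIP_DEPS Space-separated extra pip packages to install before converting (default: none). | ||||||||||||||||||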
| # | ||||||||||||||||||
| # Writes MCore checkpoint to ${OUTPUT_DIR}/<basename(HF_MODEL_ID)>-MCore | ||||||||||||||||||
| # | ||||||||||||||||||
| # Runs: | ||||||||||||||||||
| # python examples/conversion/convert_checkpoints.py import \ | ||||||||||||||||||
| # --hf-model $HF_MODEL_ID \ | ||||||||||||||||||
| # --megatron-path $OUTPUT_DIR/<model>-MCore \ | ||||||||||||||||||
| # --torch-dtype $TORCH_DTYPE | ||||||||||||||||||
|
|
||||||||||||||||||
# Abort on the first failing command.
set -e

# HF_MODEL_ID is the only mandatory input. The ":-" default keeps this check
# well-defined even when the variable is unset and nounset is active.
if [[ -z "${HF_MODEL_ID:-}" ]]; then
    echo "[ERROR] HF_MODEL_ID is required" >&2
    exit 1
fi

# Resolve this script's real location (symlinks followed) and derive the
# launcher root three levels up, plus the two module checkouts beneath it.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
LAUNCHER_DIR="${SCRIPT_DIR}/../../.."
BRIDGE_DIR="${LAUNCHER_DIR}/modules/Megatron-Bridge"
MLM_DIR="${LAUNCHER_DIR}/modules/Megatron-LM"

# The conversion entry point is run from the Megatron-Bridge checkout, so a
# missing or uninitialised submodule is fatal regardless of what is installed.
# Fail here with an actionable message rather than with a later pip/python
# "no such file" error.
if [[ ! -f "${BRIDGE_DIR}/examples/conversion/convert_checkpoints.py" ]]; then
    echo "[ERROR] Megatron-Bridge checkout not found at ${BRIDGE_DIR}" >&2
    echo "[ERROR] Run: git submodule update --init tools/launcher/modules/Megatron-Bridge" >&2
    exit 1
fi

# Install the local checkout only when megatron.bridge is not importable yet.
# stderr of the probe is discarded on purpose: an import traceback is the
# expected outcome in a fresh environment.
if ! python -c "import megatron.bridge" 2>/dev/null; then
    echo "[INFO] Installing megatron-bridge from ${BRIDGE_DIR}"
    # Drop any inherited pip constraints file before installing
    # (presumably so environment-pinned versions cannot block the install).
    unset PIP_CONSTRAINT
    pip install -e "${BRIDGE_DIR}"
fi
|
Comment on lines
+45
to
+49
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Force local Megatron-Bridge resolution to avoid version drift.
Proposed fix:
-if ! python -c "import megatron.bridge" 2>/dev/null; then
- echo "[INFO] Installing megatron-bridge from ${BRIDGE_DIR}"
- unset PIP_CONSTRAINT
- pip install -e "${BRIDGE_DIR}"
-fi
+echo "[INFO] Installing megatron-bridge from ${BRIDGE_DIR}"
+unset PIP_CONSTRAINT
+pip install -e "${BRIDGE_DIR}"
📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||
|
|
||||||||||||||||||
# Optional extra packages, supplied as a whitespace-separated list.
# Split first so that a blank or whitespace-only value is treated as "none"
# instead of invoking pip without any requirement.
read -r -a _deps <<< "${EXTRA_PIP_DEPS:-}"
if (( ${#_deps[@]} > 0 )); then
    echo "[INFO] Installing extra deps: ${EXTRA_PIP_DEPS}"
    unset PIP_CONSTRAINT
    # --no-build-isolation: mamba-ssm/causal-conv1d need torch visible at build time.
    pip install --no-build-isolation "${_deps[@]}"
fi

# Megatron-Bridge needs newer megatron.core (incl. megatron.core.distributed.fsdp).
# Prepend local Megatron-LM to PYTHONPATH so its sources shadow installed megatron-core.
# The ":" separator is added only when PYTHONPATH already has content; a
# trailing empty entry would put the current directory on the module search path.
export PYTHONPATH="${MLM_DIR}${PYTHONPATH:+:${PYTHONPATH}}"

# Apply defaults for the optional inputs.
OUTPUT_DIR="${OUTPUT_DIR:-$(pwd)}"
MODEL_NAME="$(basename "${HF_MODEL_ID}")"
TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}"

mkdir -p "${OUTPUT_DIR}"

# Make OUTPUT_DIR absolute while still in the caller's working directory.
# Without this, a relative OUTPUT_DIR would be created here but interpreted
# relative to BRIDGE_DIR after the cd below, so the checkpoint would be written
# somewhere else. CDPATH is cleared so cd cannot jump to an unrelated directory.
OUTPUT_DIR="$(CDPATH='' cd -- "${OUTPUT_DIR}" && pwd)"
MEGATRON_PATH="${OUTPUT_DIR}/${MODEL_NAME}-MCore"

# The converter is addressed by a path relative to the Megatron-Bridge checkout.
# exec replaces this shell, so the converter's exit status becomes the script's.
cd "${BRIDGE_DIR}"
exec python examples/conversion/convert_checkpoints.py import \
    --hf-model "${HF_MODEL_ID}" \
    --megatron-path "${MEGATRON_PATH}" \
    --torch-dtype "${TORCH_DTYPE}"
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,34 @@ | ||
| # Megatron-Bridge import for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16. | ||
| # | ||
| # Imports HF weights to a Megatron-LM checkpoint via AutoBridge.import_ckpt | ||
| # (use_cpu_initialization=True). Uses a single 8xH100 Slurm node — Megatron-Bridge | ||
| # requires at least 1 GPU for nccl init even with CPU-resident weights. | ||
| # | ||
| # Usage: | ||
| # export SLURM_HOST=<slurm-host> | ||
| # export SLURM_ACCOUNT=<your-team> | ||
| # export SLURM_PARTITION=<gpu-partition> # default: batch | ||
| # export SLURM_JOB_DIR=/home/scratch.<user>/experiments | ||
| # export HF_TOKEN=<your-hf-token> # gated model | ||
| # cd tools/launcher | ||
| # uv run launch.py --yaml examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml --yes | ||
|
|
||
| job_name: Nemotron-3-Super-120B_bridge_import | ||
| pipeline: | ||
| skip: false | ||
| allow_to_fail: false | ||
| note: "HF -> MCore import via Megatron-Bridge (8xH100)" | ||
|
|
||
| task_0: | ||
| script: common/megatron_bridge/import/import.sh | ||
| environment: | ||
| - HF_MODEL_ID: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 | ||
| - OUTPUT_DIR: /scratchspace/megatron-bridge | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Replace hardcoded Hardcoding As per coding guidelines, 🤖 Prompt for AI Agents |
||
| - EXTRA_PIP_DEPS: "mamba-ssm causal-conv1d" | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If we use the NeMo container, then we won't need these extra deps. |
||
| slurm_config: | ||
| _factory_: "slurm_factory" | ||
| partition: batch | ||
| nodes: 1 | ||
| ntasks_per_node: 1 | ||
| gpus_per_node: 8 | ||
| time: "04:00:00" | ||
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why does this script need to be so complicated, and why can't we just run
/opt/Megatron-Bridge/examples/conversion/convert_checkpoints.py directly? Can we assume this script is run in the nemo:26.02 or later container, and hence that all required dependencies are already present?