From e5c6b67ebf43639228d12aad0567065a388c2896 Mon Sep 17 00:00:00 2001 From: Jennifer Chen Date: Mon, 18 May 2026 13:24:53 -0700 Subject: [PATCH] megatron-bridge import example in launcher Signed-off-by: Jennifer Chen --- .gitmodules | 3 + .../common/megatron_bridge/import/import.sh | 74 +++++++++++++++++++ .../megatron_bridge_import.yaml | 34 +++++++++ tools/launcher/launch.py | 6 +- tools/launcher/modules/Megatron-Bridge | 1 + tools/launcher/modules/Megatron-LM | 2 +- 6 files changed, 118 insertions(+), 2 deletions(-) create mode 100755 tools/launcher/common/megatron_bridge/import/import.sh create mode 100644 tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml create mode 160000 tools/launcher/modules/Megatron-Bridge diff --git a/.gitmodules b/.gitmodules index 9043516bbfd..57d9d48b6f9 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,6 @@ [submodule "tools/launcher/modules/Megatron-LM"] path = tools/launcher/modules/Megatron-LM url = https://github.com/NVIDIA/Megatron-LM.git +[submodule "tools/launcher/modules/Megatron-Bridge"] + path = tools/launcher/modules/Megatron-Bridge + url = https://github.com/NVIDIA-NeMo/Megatron-Bridge.git diff --git a/tools/launcher/common/megatron_bridge/import/import.sh b/tools/launcher/common/megatron_bridge/import/import.sh new file mode 100755 index 00000000000..03446873978 --- /dev/null +++ b/tools/launcher/common/megatron_bridge/import/import.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Megatron-Bridge HF -> Megatron checkpoint import (CPU-capable). +# +# Required env: HF_MODEL_ID (e.g. nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16) +# Optional env: +# OUTPUT_DIR Parent dir for the MCore checkpoint (default: cwd). +# TORCH_DTYPE Model dtype for HF load (default: bfloat16). +# +# Writes MCore checkpoint to ${OUTPUT_DIR}/$(basename "${HF_MODEL_ID}")-MCore +# +# Runs: +# python examples/conversion/convert_checkpoints.py import \ +# --hf-model $HF_MODEL_ID \ +# --megatron-path $OUTPUT_DIR/$(basename "$HF_MODEL_ID")-MCore \ +# --torch-dtype $TORCH_DTYPE + +set -e + +if [[ -z "${HF_MODEL_ID}" ]]; then + echo "[ERROR] HF_MODEL_ID is required" >&2 + exit 1 +fi + +SCRIPT_DIR="$(dirname "$(readlink -f "$0")")" +LAUNCHER_DIR="${SCRIPT_DIR}/../../.." +BRIDGE_DIR="${LAUNCHER_DIR}/modules/Megatron-Bridge" +MLM_DIR="${LAUNCHER_DIR}/modules/Megatron-LM" + +if ! python -c "import megatron.bridge" 2>/dev/null; then + echo "[INFO] Installing megatron-bridge from ${BRIDGE_DIR}" + unset PIP_CONSTRAINT + pip install -e "${BRIDGE_DIR}" +fi + +if [[ -n "${EXTRA_PIP_DEPS}" ]]; then + echo "[INFO] Installing extra deps: ${EXTRA_PIP_DEPS}" + unset PIP_CONSTRAINT + read -r -a _deps <<< "${EXTRA_PIP_DEPS}" + # --no-build-isolation: mamba-ssm/causal-conv1d need torch visible at build time. + pip install --no-build-isolation "${_deps[@]}" +fi + +# Megatron-Bridge needs newer megatron.core (incl. megatron.core.distributed.fsdp). +# Prepend local Megatron-LM to PYTHONPATH so its sources shadow installed megatron-core. 
+export PYTHONPATH="${MLM_DIR}:${PYTHONPATH}" + +OUTPUT_DIR="${OUTPUT_DIR:-$(pwd)}" +MODEL_NAME="$(basename "${HF_MODEL_ID}")" +MEGATRON_PATH="${OUTPUT_DIR}/${MODEL_NAME}-MCore" +TORCH_DTYPE="${TORCH_DTYPE:-bfloat16}" + +mkdir -p "${OUTPUT_DIR}" + +cd "${BRIDGE_DIR}" +exec python examples/conversion/convert_checkpoints.py import \ + --hf-model "${HF_MODEL_ID}" \ + --megatron-path "${MEGATRON_PATH}" \ + --torch-dtype "${TORCH_DTYPE}" diff --git a/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml new file mode 100644 index 00000000000..12e0db9d3f5 --- /dev/null +++ b/tools/launcher/examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml @@ -0,0 +1,34 @@ +# Megatron-Bridge import for nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16. +# +# Imports HF weights to a Megatron-LM checkpoint via AutoBridge.import_ckpt +# (use_cpu_initialization=True). Uses a single 8xH100 Slurm node — Megatron-Bridge +# requires at least 1 GPU for nccl init even with CPU-resident weights. 
+# +# Usage: +# export SLURM_HOST=<slurm-login-host> +# export SLURM_ACCOUNT=<slurm-account> +# export SLURM_PARTITION=<partition> # default: batch +# export SLURM_JOB_DIR=/home/scratch.<user>/experiments +# export HF_TOKEN=<hf-token> # gated model +# cd tools/launcher +# uv run launch.py --yaml examples/nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16/megatron_bridge_import.yaml --yes + +job_name: Nemotron-3-Super-120B_bridge_import +pipeline: + skip: false + allow_to_fail: false + note: "HF -> MCore import via Megatron-Bridge (8xH100)" + + task_0: + script: common/megatron_bridge/import/import.sh + environment: + - HF_MODEL_ID: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16 + - OUTPUT_DIR: /scratchspace/megatron-bridge + - EXTRA_PIP_DEPS: "mamba-ssm causal-conv1d" + slurm_config: + _factory_: "slurm_factory" + partition: batch + nodes: 1 + ntasks_per_node: 1 + gpus_per_node: 8 + time: "04:00:00" diff --git a/tools/launcher/launch.py b/tools/launcher/launch.py index fdb867f08aa..2561dc0feb5 100644 --- a/tools/launcher/launch.py +++ b/tools/launcher/launch.py @@ -61,13 +61,17 @@ "modules/Megatron-LM/megatron/*", "modules/Megatron-LM/examples/*", "modules/Megatron-LM/*.py", + "modules/Megatron-Bridge/src/*", + "modules/Megatron-Bridge/examples/*", + "modules/Megatron-Bridge/pyproject.toml", + "modules/Megatron-Bridge/README.md", "modules/Model-Optimizer/modelopt/*", "modules/Model-Optimizer/modelopt_recipes/*", "modules/Model-Optimizer/examples/*", "examples/*", "common/*", ], - relative_path=[LAUNCHER_DIR] * 8, + relative_path=[LAUNCHER_DIR] * 12, ) MODELOPT_SRC_PATH = os.path.join(LAUNCHER_DIR, "modules/Model-Optimizer/modelopt") diff --git a/tools/launcher/modules/Megatron-Bridge b/tools/launcher/modules/Megatron-Bridge new file mode 160000 index 00000000000..6f24c71f395 --- /dev/null +++ b/tools/launcher/modules/Megatron-Bridge @@ -0,0 +1 @@ +Subproject commit 6f24c71f3957054a2261d028217640cea3d5e91c diff --git a/tools/launcher/modules/Megatron-LM b/tools/launcher/modules/Megatron-LM index 35d5c653e38..86bf4765938 160000 --- 
a/tools/launcher/modules/Megatron-LM +++ b/tools/launcher/modules/Megatron-LM @@ -1 +1 @@ -Subproject commit 35d5c653e38a0b5b3772627a7454b059c7bca932 +Subproject commit 86bf47659387383b99bb345cc4f2c090c73100b0