diff --git a/docs/api/common.rst b/docs/api/common.rst index 3ddb66df98..ee00415c7f 100644 --- a/docs/api/common.rst +++ b/docs/api/common.rst @@ -14,6 +14,8 @@ Common API .. autoapiclass:: transformer_engine.common.recipe.NVFP4BlockScaling(fp4_format=Format.E2M1) +.. autoapiclass:: transformer_engine.common.recipe.NVFP4PerTokenBlockScaling(fp4_format=Format.E2M1) + .. autoapiclass:: transformer_engine.common.recipe.Float8CurrentScaling(fp8_format=Format.HYBRID) .. autoapiclass:: transformer_engine.common.recipe.Float8BlockScaling(fp8_format=Format.E4M3) diff --git a/docs/envvars.rst b/docs/envvars.rst index bd62ccac46..45b31d6b82 100644 --- a/docs/envvars.rst +++ b/docs/envvars.rst @@ -287,6 +287,48 @@ Kernel Configuration :Default: ``0`` :Description: Enable row-scaled NVFP4 tensors for forward activation quantizers in the ``NVFP4BlockScaling`` recipe. When set to ``1`` (or when ``NVFP4BlockScaling(row_scaled_activation=True)`` is used), rowwise ``amax`` metadata is stored as one FP32 value per tensor row instead of a single scalar. +.. envvar:: NVTE_NVFP4_DISABLE_RHT + + :Type: ``int`` (0 or 1) + :Default: ``0`` + :Description: Opt out of the random Hadamard transform (RHT) in the per-tensor ``NVFP4BlockScaling`` recipe. RHT is applied by default to the forward activation and backward gradient quantizers. Set to ``1`` (or use ``NVFP4BlockScaling(disable_rht=True)``) to disable it. No effect on the per-token path (see :envvar:`NVTE_NVFP4_PER_TOKEN_RHT`). + +.. envvar:: NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING + + :Type: ``int`` (0 or 1) + :Default: ``0`` + :Description: Opt out of stochastic rounding (SR) in the per-tensor ``NVFP4BlockScaling`` recipe. SR is applied by default to the backward gradient quantizer. Set to ``1`` (or use ``NVFP4BlockScaling(disable_stochastic_rounding=True)``) to disable it. No effect on the per-token path (see :envvar:`NVTE_NVFP4_PER_TOKEN_SR`). + +.. 
envvar:: NVTE_NVFP4_DISABLE_2D_QUANTIZATION + + :Type: ``int`` (0 or 1) + :Default: ``0`` + :Description: Opt out of 2D (16x16 inner tile + scalar outer amax) weight quantization in the per-tensor ``NVFP4BlockScaling`` recipe. 2D weight quantization is enabled by default. Set to ``1`` (or use ``NVFP4BlockScaling(disable_2d_quantization=True)``) to fall back to 1D (16-element block) weight quantization. No effect on the per-token path, where 2D weight quantization is always disabled (the per-token cast hard-disables 2D); see :envvar:`NVTE_NVFP4_PER_TOKEN_WEIGHT_2D` for the per-token weight-2D route. + +.. envvar:: NVTE_NVFP4_PER_TOKEN + + :Type: ``int`` (0 or 1) + :Default: ``0`` + :Description: Flip a plain ``NVFP4BlockScaling`` recipe into per-token mode (per-row / per-col outer ``amax`` cast plus the fused-EVT CUTLASS GEMM) without changing the recipe class. This lets frameworks that already construct a default ``NVFP4BlockScaling`` (e.g. Megatron-Core with ``--fp4-format e2m1``) opt into per-token purely from the launch environment. Equivalent to constructing the explicit ``NVFP4PerTokenBlockScaling`` recipe. The per-token forward path currently requires the unfused norm+amax path: also set ``NVTE_NORM_FWD_USE_CUDNN=1`` (the fused norm+amax path rejects per-token quantizers). + +.. envvar:: NVTE_NVFP4_PER_TOKEN_RHT + + :Type: ``int`` (0 or 1) + :Default: ``0`` + :Description: Per-token only. Opt into the random Hadamard transform (RHT) on the per-token forward activation and backward gradient quantizers. Per-token disables RHT by default (its per-row outer amax already mitigates the long-tail outliers RHT targets); set to ``1`` (or use ``NVFP4PerTokenBlockScaling(per_token_rht=True)``) to re-enable it. No effect on the per-tensor path. + +.. envvar:: NVTE_NVFP4_PER_TOKEN_SR + + :Type: ``int`` (0 or 1) + :Default: ``0`` + :Description: Per-token only. Opt into stochastic rounding (SR) on the per-token backward gradient quantizer (the K2 encode kernel implements a Philox-dithered FP4 cast). 
Per-token disables SR by default; set to ``1`` (or use ``NVFP4PerTokenBlockScaling(per_token_sr=True)``) to re-enable it. No effect on the per-tensor path. + +.. envvar:: NVTE_NVFP4_PER_TOKEN_WEIGHT_2D + + :Type: ``int`` (0 or 1) + :Default: ``0`` + :Description: Per-token only. Quantize the forward weight with the per-tensor 2D cast (16x16 inner tile + scalar outer amax) emitted in per-token layout, instead of the per-token 1D weight cast. 2D weight quantization is transposition-invariant, so forward (rowwise) and dgrad (columnwise) see the same weight, removing the 1D path's weight-gradient bias. Activations and gradients stay on the standard per-token 1D cast. Set to ``1`` (or use ``NVFP4PerTokenBlockScaling(per_token_weight_2d=True)``). No effect on the per-tensor path. + .. envvar:: NVTE_NVFP4_4OVER6 :Type: ``str`` (``none``, ``weights``, ``activations``, or ``all``) diff --git a/docs/features/low_precision_training/nvfp4/nvfp4.rst b/docs/features/low_precision_training/nvfp4/nvfp4.rst index 0415963a71..e449a6079c 100644 --- a/docs/features/low_precision_training/nvfp4/nvfp4.rst +++ b/docs/features/low_precision_training/nvfp4/nvfp4.rst @@ -207,6 +207,89 @@ NVFP4 all-gather is supported. *Figure 6. Quantization and all-gather flow for NVFP4 showing amax synchronization and hierarchical scaling.* +Per-token NVFP4 +--------------- + +The default ``NVFP4BlockScaling`` recipe computes a single per-tensor outer +``amax`` (``s_global``) for each tensor. The **per-token** variant instead +computes a per-row outer ``amax`` (length ``M``) for rowwise data and a per-col +outer ``amax`` (length ``K``) for columnwise data, giving each token/row its own +global scale. This finer outer-scale granularity can improve accuracy, and the +per-token cast feeds a dedicated fused-EVT CUTLASS GEMM that consumes the vector +outer ``amax`` directly (cuBLASLt cannot). 
+ +There are two ways to select per-token, both equivalent: + +* **Explicit recipe class** ``NVFP4PerTokenBlockScaling`` (recommended for code + that constructs its own recipe). +* **Environment variable** ``NVTE_NVFP4_PER_TOKEN=1`` on a plain + ``NVFP4BlockScaling``. This lets frameworks that only ever build a default + ``NVFP4BlockScaling`` (for example Megatron-Core) opt into per-token purely + from the launch environment, with no framework-side code change. + +.. code-block:: python + + from transformer_engine.common.recipe import NVFP4PerTokenBlockScaling + import transformer_engine.pytorch as te + + # RHT and SR are OFF by default on the per-token path; opt in as needed. + recipe = NVFP4PerTokenBlockScaling(per_token_rht=True, per_token_sr=True) + with te.fp8_autocast(enabled=True, fp8_recipe=recipe): + out = model(inp) + +**Differences from the per-tensor default** + +* RHT and stochastic rounding are **off by default** on the per-token path (the + per-row outer ``amax`` already mitigates the long-tail outliers RHT targets). + Opt in with ``per_token_rht=True`` / ``per_token_sr=True`` (env vars + :envvar:`NVTE_NVFP4_PER_TOKEN_RHT` / :envvar:`NVTE_NVFP4_PER_TOKEN_SR`). +* 2D weight quantization is disabled by default. The per-token weight-2D route + (``per_token_weight_2d=True`` / :envvar:`NVTE_NVFP4_PER_TOKEN_WEIGHT_2D`) + quantizes the forward weight with the transposition-invariant 2D cast emitted + in per-token layout, removing the 1D weight-gradient bias. +* ``row_scaled_activation`` and 4over6 are forced off (mutually exclusive with + the per-token amax layout). + +**Requirement: unfused norm forward.** The per-token forward path requires the +unfused norm+amax implementation; the fused norm+amax path rejects per-token +quantizers. When the first GEMM consumes a fused norm output (for example +``LayerNormLinear``), also set ``NVTE_NORM_FWD_USE_CUDNN=1``. 
+ +**Currently unsupported on the per-token path**: ``fuse_wgrad_accumulation=True``, +forward/backward output quantization, and communication/bulk overlap. + +Running per-token NVFP4 with Megatron-Core +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Megatron-Core builds a plain ``NVFP4BlockScaling`` for ``--fp4-format e2m1`` and +has no CLI for per-token, so per-token is selected entirely through TE +environment variables. A minimal launch looks like: + +.. code-block:: bash + + # Select NVFP4 per-token via TE env vars (read at recipe construction). + export NVTE_NVFP4_PER_TOKEN=1 + export NVTE_NORM_FWD_USE_CUDNN=1 # required: unfused norm forward + # Optional per-token knobs: + # export NVTE_NVFP4_PER_TOKEN_RHT=1 + # export NVTE_NVFP4_PER_TOKEN_SR=1 + # export NVTE_NVFP4_PER_TOKEN_WEIGHT_2D=1 + + python pretrain_gpt.py \ + --transformer-impl transformer_engine \ + --fp4-format e2m1 \ + --no-gradient-accumulation-fusion \ + ... # remaining model / data / optimizer args + +Notes: + +* ``--no-gradient-accumulation-fusion`` is required because the per-token kernel + does not yet support fused wgrad accumulation. +* To keep the first/last transformer layers in BF16, use Megatron's + ``--first-last-layers-bf16 --num-layers-at-start-in-bf16 N + --num-layers-at-end-in-bf16 M`` flags (those layers simply skip the FP4 + autocast; the recipe is unchanged). + Examples -------- diff --git a/examples/pytorch/nvfp4_per_token_megatron/README.md b/examples/pytorch/nvfp4_per_token_megatron/README.md new file mode 100644 index 0000000000..760c2c1a11 --- /dev/null +++ b/examples/pytorch/nvfp4_per_token_megatron/README.md @@ -0,0 +1,140 @@ +# NVFP4 per-token training with Megatron-Core + +This example shows how to train a small Mixture-of-Experts (MoE) model with +the **NVFP4 per-token** quantization recipe on a single GPU using +[Megatron-Core](https://github.com/NVIDIA/Megatron-LM), and how to compare it +against the per-tensor NVFP4 recipe and an unquantized BF16 baseline. 
+ +The same model / data / seed are used across all modes; only the GEMM precision +changes, so the runs are directly comparable. + +## How per-token interacts with Megatron-Core + +Megatron-Core builds a plain `transformer_engine.common.recipe.NVFP4BlockScaling` +for `--fp4-format e2m1` and has **no CLI flag for per-token**. Per-token is +selected entirely through Transformer Engine environment variables, read when the +recipe is constructed: + +| Variable | Effect | +| --- | --- | +| `NVTE_NVFP4_PER_TOKEN=1` | **Required**: Flip the recipe into per-token mode (per-row/per-col outer amax + fused CUTLASS GEMM) | +| `NVTE_NORM_FWD_USE_CUDNN=1` | **Required** with per-token: forces the unfused norm forward (the fused norm+amax path rejects per-token currently) | +| `NVTE_NVFP4_PER_TOKEN_RHT=1` | Opt into the random Hadamard transform (off by default) | +| `NVTE_NVFP4_PER_TOKEN_SR=1` | Opt into stochastic rounding (off by default) | +| `NVTE_NVFP4_PER_TOKEN_WEIGHT_2D=1` | Use the transposition-invariant 2D weight cast in per-token layout | + +For the per-tensor recipe, the analogous knobs are +`NVTE_NVFP4_DISABLE_RHT`, `NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING`, and +`NVTE_NVFP4_DISABLE_2D_QUANTIZATION`. + +See the +[NVFP4 documentation](../../../docs/features/low_precision_training/nvfp4/nvfp4.rst) +("Per-token NVFP4") and `docs/envvars.rst` for full details. Equivalently, code +that constructs its own recipe can use the public +`transformer_engine.common.recipe.NVFP4PerTokenBlockScaling` class instead of the +env var. + +Keeping the first/last transformer layers in BF16 is a Megatron-Core CLI feature +(`--first-last-layers-bf16 --num-layers-at-start-in-bf16 N +--num-layers-at-end-in-bf16 M`); those layers simply skip the FP4 autocast. This +is also supported with the per-token recipe. + +## Prerequisites + +- A Blackwell GPU (SM100+) — NVFP4 training requires it. 
+- Transformer Engine built from this repository **with per-token support** + (`NVTE_CUDA_ARCHS=100a NVTE_BUILD_THREADS_PER_JOB=8 NVTE_FRAMEWORK=pytorch pip install -e . --no-build-isolation`). +- A [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) checkout (provides + `pretrain_gpt.py` and the `megatron` package). +- A tokenized dataset and tokenizer (the scripts default to an OLMo-1124 corpus + with the Moonlight-16B-A3B tokenizer). +- For Weights & Biases logging: authenticate with `wandb login` or export + `WANDB_API_KEY` in your environment. + +## Files + +| File | Purpose | +| --- | --- | +| `run_moe_nvfp4_singlegpu.sh` | Core launcher. Run **inside** the container from a shell that can see `pretrain_gpt.py`. Takes one mode: `bf16`, `prod` (== `pertensor`), or `pertoken`. | +| `sbatch_moe_nvfp4_singlegpu.sh` | Slurm wrapper: starts the container, (re)installs the editable TE build, then runs one or more variants (so far one GPU each). | +| `submit_chain.sh` | Submit a chain of dependent Slurm jobs that auto-resume from the stable checkpoint dir. | + +## Quick start (standalone, inside a container) + +```bash +# Point at your Megatron-LM checkout, data, and tokenizer. 
+export MLM_DIR=/path/to/Megatron-LM +export DATA_PATH=/path/to/datasets/your_data +export TOKENIZER_MODEL=/path/to/tokenizers/Moonlight-16B-A3B +export TRAIN_ITERS=2000 + +bash run_moe_nvfp4_singlegpu.sh pertoken + +# Compare against per-tensor NVFP4 and BF16: +bash run_moe_nvfp4_singlegpu.sh pertensor +bash run_moe_nvfp4_singlegpu.sh bf16 +``` + +To enable per-token RHT / SR / 2D-weight, export the knobs before launching: + +```bash +export NVTE_NVFP4_PER_TOKEN_RHT=1 +export NVTE_NVFP4_PER_TOKEN_SR=1 +export NVTE_NVFP4_PER_TOKEN_WEIGHT_2D=1 +bash run_moe_nvfp4_singlegpu.sh pertoken +``` + +## Slurm + +Edit the **host-side config block** at the top of +`sbatch_moe_nvfp4_singlegpu.sh` (Slurm account, container `IMAGE`, `HOST_MOUNT`, +`TE_DIR`, and `HOST_LOG_DIR` / the `#SBATCH --output/--error` paths) for your +cluster, then: + +```bash +# One mode: +sbatch sbatch_moe_nvfp4_singlegpu.sh pertoken + +# Up to 4 variants concurrently (one GPU each): +sbatch sbatch_moe_nvfp4_singlegpu.sh "bf16,pertensor+rht+sr,pertoken" + +# Override knobs via --export: +sbatch --export=ALL,TRAIN_ITERS=2000,SEED=42 sbatch_moe_nvfp4_singlegpu.sh pertoken +``` + +Spec syntax: `<mode>[+rht][+sr][+1d][+2d][+fb]` where `mode` is +`bf16 | prod (== pertensor) | pertoken`. `+rht`/`+sr` turn those features on, +`+1d` forces 1D weights (per-tensor only), `+2d` enables the per-token 2D-weight +route, and `+fb` keeps the first/last layers in BF16. + +For runs that exceed one Slurm wall-clock window, chain dependent jobs that +resume from the stable per-variant checkpoint dir: + +```bash +CHAIN=3 bash submit_chain.sh \ + --export=ALL,IMAGE=/path/to/te_pertoken.sqsh,SKIP_BUILD=1,TRAIN_ITERS=60000 \ + sbatch_moe_nvfp4_singlegpu.sh pertoken +``` + +## Notes and current limitations + +The per-token recipe is currently intended for **accuracy evaluation and +comparison** (per-token vs per-tensor vs BF16), **not** for optimized production +deployment. 
Concretely: + +- **Requires `NVTE_NORM_FWD_USE_CUDNN=1`** (the unfused cuDNN norm forward). + The fused norm+amax path (`NVTE_NORM_FWD_USE_CUDNN=0`, the default) does **not** + support per-token and is rejected at the C++ quantizer. The launcher sets this + for you in `pertoken` mode. +- **Not tested with CUDA graphs.** The per-token path has not been validated under + Megatron's CUDA graph capture; leave CUDA graphs disabled for now. +- **Kernels are not yet performance-optimal.** Several per-token cast / GEMM + kernels are functional but not tuned, so wall-clock throughput is not + representative of the recipe's eventual performance. Use this example for + numerical/accuracy comparison, not perf benchmarking. +- `--no-gradient-accumulation-fusion` is required: the per-token kernel does not + yet support fused wgrad accumulation. The scripts set it for every mode so + only the GEMM precision differs. +- The example reduces the MoE expert count to 64 so all experts stay local at + EP=1 on a single GPU (TE's grouped-NVFP4 kernels cap at 64 tensors per launch). + Real training shards experts via EP>1. diff --git a/examples/pytorch/nvfp4_per_token_megatron/run_moe_nvfp4_singlegpu.sh b/examples/pytorch/nvfp4_per_token_megatron/run_moe_nvfp4_singlegpu.sh new file mode 100755 index 0000000000..bf6860cf62 --- /dev/null +++ b/examples/pytorch/nvfp4_per_token_megatron/run_moe_nvfp4_singlegpu.sh @@ -0,0 +1,321 @@ +#!/bin/bash + +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +# ============================================================================ +# Single-GPU Megatron-Core *MoE* NVFP4 smoke run (bf16 vs prod vs per-token). 
+ +# Usage (run INSIDE the compute-node container, from the Megatron-LM root): +# bash run_moe_nvfp4_singlegpu.sh bf16 # unquantized bf16 baseline +# bash run_moe_nvfp4_singlegpu.sh prod # production NVFP4 (per-tensor block-scaling) +# bash run_moe_nvfp4_singlegpu.sh pertoken # NVFP4 per-token recipe +# +# Identical model / data / seed across modes; only the GEMM precision changes due to different quant recipe. +# ============================================================================ +set -euo pipefail + +MODE="${1:-prod}" +[[ "$MODE" == "pertensor" ]] && MODE="prod" # alias: pertensor == prod (NVFP4 per-tensor) + +# --------------------------------------------------------------------------- +# 0. Per-container hygiene: drop image-baked flash-attn (TE c10 ABI mismatch). +# --------------------------------------------------------------------------- +pip uninstall -y flash-attn flash_attn flash_attn_2_cuda >/dev/null 2>&1 || true + +# --------------------------------------------------------------------------- +# 1. Recipe selection. +# --------------------------------------------------------------------------- +if [[ "$MODE" == "pertoken" ]]; then + export NVTE_NVFP4_PER_TOKEN=1 + export NVTE_NORM_FWD_USE_CUDNN=1 + echo "[run] MODE=pertoken -> NVTE_NVFP4_PER_TOKEN=1 NVTE_NORM_FWD_USE_CUDNN=1" +elif [[ "$MODE" == "prod" ]]; then + unset NVTE_NVFP4_PER_TOKEN || true + export NVTE_NVFP4_DISABLE_RHT="${NVTE_NVFP4_DISABLE_RHT:-1}" + export NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING="${NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING:-1}" + # 2D weight quant is ON by default for prod; +1d sets DISABLE_2D=1 (1D weights) + # for the per-tensor 1D-vs-2D weight-quant ablation. 
+ export NVTE_NVFP4_DISABLE_2D_QUANTIZATION="${NVTE_NVFP4_DISABLE_2D_QUANTIZATION:-0}" + _rht=$([[ "$NVTE_NVFP4_DISABLE_RHT" == "0" ]] && echo on || echo off) + _sr=$([[ "$NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING" == "0" ]] && echo on || echo off) + _2d=$([[ "$NVTE_NVFP4_DISABLE_2D_QUANTIZATION" == "0" ]] && echo 2D || echo 1D) + echo "[run] MODE=prod -> NVFP4 per-tensor block-scaling (RHT=$_rht SR=$_sr, weight=$_2d)" +elif [[ "$MODE" == "bf16" ]]; then + unset NVTE_NVFP4_PER_TOKEN || true + echo "[run] MODE=bf16 -> unquantized bf16 baseline" +else + echo "[run] ERROR: unknown MODE '$MODE' (expected 'bf16', 'prod' or 'pertoken')" >&2 + exit 1 +fi + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +# Locate the Megatron-LM root (where pretrain_gpt.py + the megatron package live) +# so this script works whether it sits inside Megatron-LM/ or one level up next +# to a Megatron-LM/ checkout (override with MLM_DIR=/path/to/Megatron-LM). +if [[ -n "${MLM_DIR:-}" ]]; then + : +elif [[ -f "${SCRIPT_DIR}/pretrain_gpt.py" ]]; then + MLM_DIR="$SCRIPT_DIR" +elif [[ -f "${SCRIPT_DIR}/Megatron-LM/pretrain_gpt.py" ]]; then + MLM_DIR="${SCRIPT_DIR}/Megatron-LM" +else + echo "[run] ERROR: cannot find pretrain_gpt.py under '$SCRIPT_DIR' or '$SCRIPT_DIR/Megatron-LM'." >&2 + echo "[run] Set MLM_DIR=/path/to/Megatron-LM and re-run." >&2 + exit 1 +fi +echo "[run] Megatron-LM root: $MLM_DIR" +cd "$MLM_DIR" +export PYTHONPATH="${MLM_DIR}:${PYTHONPATH:-}" +export CUDA_DEVICE_MAX_CONNECTIONS=1 +export NVTE_ALLOW_NONDETERMINISTIC_ALGO=1 +export NCCL_NVLS_ENABLE=0 + +WANDB_PROJECT="${WANDB_PROJECT:-NVFP4 per-token recipe MoE}" +if [[ "$MODE" == "pertoken" ]]; then + WANDB_EXP_NAME="nvfp4-pertoken" + # Append the active per-token recipe knobs so each variant gets its OWN + # work dir + wandb exp name (e.g. nvfp4-pertoken-rht-sr). Keep rht before sr. 
+ [[ "${NVTE_NVFP4_PER_TOKEN_RHT:-0}" == "1" ]] && WANDB_EXP_NAME+="-rht" + [[ "${NVTE_NVFP4_PER_TOKEN_SR:-0}" == "1" ]] && WANDB_EXP_NAME+="-sr" + [[ "${NVTE_NVFP4_PER_TOKEN_WEIGHT_2D:-0}" == "1" ]] && WANDB_EXP_NAME+="-2d" +elif [[ "$MODE" == "prod" ]]; then + WANDB_EXP_NAME="nvfp4-pertensor" + # Additive suffixes: feature ON iff its DISABLE env is 0 (mirrors wrapper). + [[ "${NVTE_NVFP4_DISABLE_RHT:-1}" == "0" ]] && WANDB_EXP_NAME+="-rht" + [[ "${NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING:-1}" == "0" ]] && WANDB_EXP_NAME+="-sr" + [[ "${NVTE_NVFP4_DISABLE_2D_QUANTIZATION:-0}" == "1" ]] && WANDB_EXP_NAME+="-1d" +else + WANDB_EXP_NAME="bf16" +fi +# +fb: keep the first/last few transformer layers in bf16 (only meaningful for +# the quantized modes; no-op suffix for bf16). +[[ "${FIRST_LAST_BF16:-0}" == "1" && "$MODE" != "bf16" ]] && WANDB_EXP_NAME+="-fb" + +WORK_ROOT="${WORK_ROOT:-${SCRIPT_DIR}/work}" + +RUN_NAME="${RUN_NAME:-${WANDB_EXP_NAME}${RUN_TAG:+-${RUN_TAG}}-seed${SEED:-1234}}" + +WANDB_EXP_NAME="${WANDB_EXP_PREFIX-}${RUN_NAME}" +RUN_DIR="${WORK_ROOT}/${RUN_NAME}" +WANDB_SAVE_DIR="${WANDB_DIR:-${RUN_DIR}/wandb}" +TB_DIR="${TB_DIR:-${RUN_DIR}/tb}" +CKPT_DIR="${CKPT_DIR:-${RUN_DIR}/checkpoints}" +mkdir -p "$WANDB_SAVE_DIR" "$TB_DIR" "$CKPT_DIR" +echo "[run] work dir: $RUN_DIR (ckpt=$CKPT_DIR)" +echo "[run] wandb: project='$WANDB_PROJECT' exp=$WANDB_EXP_NAME (WANDB_MODE=${WANDB_MODE:-online})" + +LOG_ARGS=( + --wandb-project "$WANDB_PROJECT" + --wandb-exp-name "$WANDB_EXP_NAME" + --wandb-save-dir "$WANDB_SAVE_DIR" + # Disable OneLogger: with enable_one_logger=True it spins up a SECOND wandb + # run (project=one_logger_project="megatron-lm", auto name like "woven-leaf"), + # which becomes the active wandb.run and steals wandb.log() metrics from our + # named run -> loss never shows in our named run. Off => single wandb run. 
+ --no-one-logger + --tensorboard-dir "$TB_DIR" + --tensorboard-log-interval 10 + --log-interval 10 + --log-num-zeros-in-grad + --eval-iters 0 + --eval-interval 100000 + # Checkpointing: stable per-variant dir, same path for save+load so each + # resubmission auto-resumes this variant. + # SAVE_INTERVAL how often to write a ckpt (def 2000). + # SAVE_RETAIN_INTERVAL rolling cleanup: mcore keeps the LATEST ckpt always + --save "$CKPT_DIR" + --load "$CKPT_DIR" + --save-interval "${SAVE_INTERVAL:-2000}" + --save-retain-interval "${SAVE_RETAIN_INTERVAL:-10000}" +) + +# --------------------------------------------------------------------------- +# Model: Ling-mini-v2 MoE block. All GEMM dims %128 (per-token alignment): +# hidden=2048, ffn=5120, moe-ffn=512, shared-expert=512, seq=4096. +# Routing: 64 experts / 8 groups (8 each), group_topk 4, topk 8 +# (8 <= group_topk*experts_per_group = 4*8 = 32). Reduced from Ling-mini-v2's +# 256 experts because EP=1 on one GPU exceeds TE's 64-tensor grouped cap. +# Layer 0 dense, layers 1-19 MoE (moe-layer-freq); num-layers 20 = full +# Ling-mini-v2 depth (fits one GB200 now that experts are down to 64). +# --------------------------------------------------------------------------- +MODEL_ARGS=( + --use-mcore-models + --transformer-impl transformer_engine + --untie-embeddings-and-output-weights + --disable-bias-linear + --swiglu + --position-embedding-type rope + --no-rope-fusion + --rotary-base 10000 + --rotary-percent 0.5 + --rotary-scaling-factor 40 + --normalization RMSNorm + --norm-epsilon 1e-6 + --group-query-attention + --num-attention-heads 16 + --num-query-groups 4 + --qk-layernorm + --hidden-dropout 0 + --attention-dropout 0 + --num-layers 20 + --hidden-size 2048 + --ffn-hidden-size 5120 + --seq-length 4096 + --max-position-embeddings 4096 + --no-masked-softmax-fusion + --attention-softmax-in-fp32 +) + +# MoE block based on Ling-mini-v2 (topk8 / 8 groups / group_topk4 / moe-ffn 512 / +# shared 512). 
num-layers 20 = full Ling-mini-v2 depth (fits one GB200 now that +# experts are down to 64). num-experts (64 vs 256) reduced because EP=1 on one +# GPU puts all experts local and TE's grouped-NVFP4 kernels cap at 64 +# tensors/launch (real training shards via EP>1 so local experts <=64). +# 8 groups * 8 experts/group = 64. +MOE_ARGS=( + --num-experts 64 + --moe-layer-freq "[0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]" + --moe-ffn-hidden-size 512 + --moe-shared-expert-intermediate-size 512 + --moe-grouped-gemm + --moe-router-load-balancing-type aux_loss + --moe-aux-loss-coeff 0.001 + --moe-z-loss-coeff 0.0000035 + --moe-router-topk 8 + --moe-router-topk-scaling-factor 2.5 + --moe-router-num-groups 8 + --moe-router-group-topk 4 + --moe-router-dtype fp32 + --moe-router-score-function sigmoid + --moe-router-enable-expert-bias + --moe-router-bias-update-rate 1e-3 + --moe-token-dispatcher-type alltoall + --moe-router-fusion + --moe-permute-fusion +) + +# --------------------------------------------------------------------------- +# Precision recipe (the part under test). +# --no-gradient-accumulation-fusion MANDATORY for per-token (accumulate=True +# unsupported); kept in all modes so only GEMM precision differs. +# --------------------------------------------------------------------------- +QUANT_ARGS=( + --bf16 + --no-gradient-accumulation-fusion +) +if [[ "$MODE" != "bf16" ]]; then + QUANT_ARGS+=(--fp4-format e2m1) +fi +# +fb variant: keep the first/last few transformer layers in bf16 (skip NVFP4) +# via Megatron's --first-last-layers-bf16. Gated by FIRST_LAST_BF16=1 (set by the +# sbatch wrapper for a +fb spec). Layer counts overridable; defaults 0 start / 3 end. 
+if [[ "${FIRST_LAST_BF16:-0}" == "1" && "$MODE" != "bf16" ]]; then + QUANT_ARGS+=( + --first-last-layers-bf16 + --num-layers-at-start-in-bf16 "${NUM_LAYERS_START_BF16:-0}" + --num-layers-at-end-in-bf16 "${NUM_LAYERS_END_BF16:-3}" + ) + echo "[run] FIRST_LAST_BF16=1 -> first ${NUM_LAYERS_START_BF16:-0} / last ${NUM_LAYERS_END_BF16:-3} layers in bf16" +fi + +# --------------------------------------------------------------------------- +# Parallelism: single GPU. EP/TP/PP/CP=1. No sequence-parallel (needs TP>1), +# no expert/comm overlap. +# --------------------------------------------------------------------------- +PARALLEL_ARGS=( + --tensor-model-parallel-size 1 + --pipeline-model-parallel-size 1 + --expert-model-parallel-size 1 + --expert-tensor-parallel-size 1 + --use-distributed-optimizer +) + +# --------------------------------------------------------------------------- +# Data / tokenizer. DATA_MODE=real (default) uses the downloaded OLMo-1124 +# algebraic_stack corpus; DATA_MODE=mock falls back to synthetic data (no +# files, loss won't converge -- only for a pure "does it run" smoke test). +# Override DATA_PATH / TOKENIZER_MODEL via env if you use a different corpus. +# --------------------------------------------------------------------------- +DATA_MODE="${DATA_MODE:-real}" +DATA_PATH="${DATA_PATH:-/path/to/datasets/olmo-1124/algebraic_stack_text_document}" +# Tokenizer that ORIGINALLY preprocessed this corpus = Moonlight-16B-A3B (custom +# tiktoken tokenizer -> needs --trust-remote-code). 
Download tokenizer-only via: +# hf download moonshotai/Moonlight-16B-A3B --include "tiktoken.model" \ +# "tokenization_moonshot.py" "tokenizer_config.json" --local-dir +TOKENIZER_MODEL="${TOKENIZER_MODEL:-/path/to/tokenizers/Moonlight-16B-A3B}" +if [[ "$DATA_MODE" == "mock" ]]; then + echo "[run] DATA_MODE=mock -> synthetic data (NullTokenizer, no convergence)" + DATA_ARGS=( + --mock-data + --tokenizer-type NullTokenizer + --vocab-size 32000 + --make-vocab-size-divisible-by 128 + ) +else + echo "[run] DATA_MODE=real -> $DATA_PATH (tokenizer=$TOKENIZER_MODEL)" + DATA_ARGS=( + --data-path "$DATA_PATH" + --split 99,1,0 + --tokenizer-type HuggingFaceTokenizer + --tokenizer-model "$TOKENIZER_MODEL" + --trust-remote-code + --make-vocab-size-divisible-by 128 + ) +fi + +# --------------------------------------------------------------------------- +# Schedule: gbs/mbs = 8 microbatches/step. Steps via TRAIN_ITERS (def 20000). +# --------------------------------------------------------------------------- +TRAIN_ITERS="${TRAIN_ITERS:-20000}" +LR_WARMUP_ITERS=$(( TRAIN_ITERS * 3 / 100 )) +SEED="${SEED:-1234}" +export WANDB_TAGS="seed=${SEED}${WANDB_TAGS:+,${WANDB_TAGS}}" +echo "[run] wandb tags: $WANDB_TAGS" +# torchrun rendezvous port: must be UNIQUE per concurrent run on the same node +# (running bf16/prod/pertoken in parallel on different GPUs needs different +# ports, else they collide on the default 29502). Overridable via MASTER_PORT. 
+MASTER_PORT="${MASTER_PORT:-29502}" +TRAIN_ARGS=( + --seed "$SEED" + --micro-batch-size 1 + --global-batch-size 8 + --train-iters "$TRAIN_ITERS" + --lr 1e-4 + --min-lr 1e-5 + --lr-decay-style cosine + --lr-warmup-iters "$LR_WARMUP_ITERS" + --lr-decay-iters "$TRAIN_ITERS" + --weight-decay 0.1 + --adam-beta1 0.9 + --adam-beta2 0.95 + --clip-grad 1.0 + --init-method-std 0.02 + --attention-backend auto +) + +# Wall-clock guard for SLURM job chaining: when set, mcore saves a checkpoint +# and exits cleanly once training has run EXIT_DURATION_MIN minutes (counted from +# train start, NOT job start -> leave headroom for container startup + final +# save under the #SBATCH --time wall). The next (dependent) job resumes from the +# ckpt. Empty/0 -> disabled (run until --train-iters). See submit_chain.sh. +EXIT_ARGS=() +if [[ -n "${EXIT_DURATION_MIN:-}" && "${EXIT_DURATION_MIN}" != "0" ]]; then + EXIT_ARGS+=(--exit-duration-in-mins "$EXIT_DURATION_MIN") + echo "[run] exit-duration-in-mins=$EXIT_DURATION_MIN (will save+exit, resume via --load on next job)" +fi + +echo "[run] launching pretrain_gpt.py MoE ($MODE) on CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-} port=$MASTER_PORT ..." +set -x +torchrun --nproc_per_node=1 --nnodes=1 \ + --master_addr=127.0.0.1 --master_port="$MASTER_PORT" \ + pretrain_gpt.py \ + "${MODEL_ARGS[@]}" \ + "${MOE_ARGS[@]}" \ + "${QUANT_ARGS[@]}" \ + "${PARALLEL_ARGS[@]}" \ + "${DATA_ARGS[@]}" \ + "${TRAIN_ARGS[@]}" \ + "${EXIT_ARGS[@]}" \ + "${LOG_ARGS[@]}" diff --git a/examples/pytorch/nvfp4_per_token_megatron/sbatch_moe_nvfp4_singlegpu.sh b/examples/pytorch/nvfp4_per_token_megatron/sbatch_moe_nvfp4_singlegpu.sh new file mode 100755 index 0000000000..60de67e4ce --- /dev/null +++ b/examples/pytorch/nvfp4_per_token_megatron/sbatch_moe_nvfp4_singlegpu.sh @@ -0,0 +1,301 @@ +#!/bin/bash + +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. 
+ +# ============================================================================ +# sbatch wrapper for the single-GPU MoE NVFP4 smoke run. +# Submits a batch job that: launches the container, applies per-container +# hygiene (flash-attn / huggingface-hub), (re)registers the editable TE build +# from the workspace, then runs run_moe_nvfp4_singlegpu.sh -- so you never have +# to srun --pty into the node by hand. +# +# Usage: +# sbatch sbatch_moe_nvfp4_singlegpu.sh # runs ALL 3 modes sequentially (1 GPU) +# sbatch sbatch_moe_nvfp4_singlegpu.sh bf16 # just bf16 +# sbatch sbatch_moe_nvfp4_singlegpu.sh prod +# sbatch sbatch_moe_nvfp4_singlegpu.sh pertoken +# Pick EXACTLY which variants run concurrently (one GPU each) with a comma list. +# QOS forces a full 4-GPU node per job, so packing up to 4 variants wastes nothing: +# sbatch sbatch_moe_nvfp4_singlegpu.sh "bf16,pertensor+rht+sr,pertoken" +# sbatch sbatch_moe_nvfp4_singlegpu.sh "pertensor,pertensor+rht,pertensor+sr,pertensor+rht+sr" +# sbatch sbatch_moe_nvfp4_singlegpu.sh "pertoken,pertoken+sr,pertoken+rht,pertoken+rht+sr" +# spec syntax: <mode>[+rht][+sr][+1d][+2d][+fb] where mode = bf16 | prod(==pertensor) | pertoken +# +fb -> keep first/last layers in bf16 (Megatron --first-last-layers-bf16; +# defaults: 0 at start, 3 at end; override via NUM_LAYERS_START_BF16 / +# NUM_LAYERS_END_BF16). e.g. "pertoken+2d+fb", "pertensor+rht+sr+fb". +# ADDITIVE RHT/SR: +rht/+sr turn the feature ON; bare = OFF for BOTH paths. +# -> bare `pertensor`/`prod` = block-scaling only (NO RHT/SR); +# -> production-default per-tensor = `pertensor+rht+sr`; +# -> per-token default (no RHT/SR) = `pertoken`. +# each spec gets GPU i and port BASE_PORT+i; <=4 specs (one per GPU). 
# Convenience aliases:
#   parallel  == "bf16,pertensor+rht+sr,pertoken"
#   parallel4 == "bf16,pertensor+rht+sr,pertoken,pertoken+sr"
#
# Pin a single mode to a specific GPU with GPU_ID:
#   sbatch --export=ALL,GPU_ID=2 sbatch_moe_nvfp4_singlegpu.sh pertoken
#
# Override knobs via --export, e.g.:
#   sbatch --export=ALL,TRAIN_ITERS=2000 sbatch_moe_nvfp4_singlegpu.sh pertoken
#   sbatch --export=ALL,SKIP_BUILD=1 sbatch_moe_nvfp4_singlegpu.sh prod
#   sbatch --export=ALL,SEED=42 sbatch_moe_nvfp4_singlegpu.sh prod
#   sbatch --export=ALL,PT_SR=1,PT_RHT=1 sbatch_moe_nvfp4_singlegpu.sh pertoken
# ============================================================================
#SBATCH -N 1
#SBATCH -p batch
#SBATCH -q short
#SBATCH -A your_slurm_account # EDIT: your Slurm account
#SBATCH --gres=gpu:4
#SBATCH --time=2:00:00
#SBATCH -J nvfp4-moe-singlegpu
# NOTE: #SBATCH --output/--error are resolved by Slurm on the HOST filesystem
# (the batch body runs on the host, outside the container), so these MUST be the
# real host path, NOT the in-container mount target. EDIT to a writable host dir.
#SBATCH --output=/path/to/TransformerEngine/examples/pytorch/nvfp4_per_token_megatron/slurm_logs/nvfp4-moe-%j.out
#SBATCH --error=/path/to/TransformerEngine/examples/pytorch/nvfp4_per_token_megatron/slurm_logs/nvfp4-moe-%j.err

# Host-side strict mode: abort on errors, unset variables and failed pipeline stages.
set -euo pipefail

# --- Host-side config block: every value can be overridden from the environment.
IMAGE="${IMAGE:-/path/to/te_container_image.sqsh}"
HOST_MOUNT="${HOST_MOUNT:-/path/to/host/workspace:/workspace}"   # host:container
TE_DIR="${TE_DIR:-/workspace/TransformerEngine}"                  # in-container path
EXAMPLE_DIR="${EXAMPLE_DIR:-${TE_DIR}/examples/pytorch/nvfp4_per_token_megatron}"
RUN_SCRIPT="${EXAMPLE_DIR}/run_moe_nvfp4_singlegpu.sh"
# HOST_LOG_DIR is the HOST path matching the #SBATCH --output/--error dir above.
HOST_LOG_DIR="${HOST_LOG_DIR:-/path/to/TransformerEngine/examples/pytorch/nvfp4_per_token_megatron/slurm_logs}"

# slurm_logs must exist on the HOST before the job writes there.
# NOTE(review): Slurm normally opens the --output/--error files before this
# batch body starts, so this mkdir is presumably too late for the very first
# submission -- create the directory before running sbatch; confirm on your cluster.
mkdir -p "$HOST_LOG_DIR"

# --- Knobs (overridable via sbatch --export) -------------------------------
# All knobs are exported so that srun --export=ALL carries them into the container.
export RUN_MODE="${1:-all}"                      # single mode | all | "a,b,c" list | parallel | parallel4
export SKIP_BUILD="${SKIP_BUILD:-0}"             # 1 = skip the pip install -e . step
export TRAIN_ITERS="${TRAIN_ITERS:-20000}"       # forwarded to the run script as TRAIN_ITERS
export SEED="${SEED:-1234}"                      # same seed across modes for a fair compare
export GPU_ID="${GPU_ID:-}"                      # pin a single-mode run to this GPU (e.g. 2)
export BASE_PORT="${BASE_PORT:-29502}"           # torchrun rendezvous base port (list runs use BASE_PORT+i, one per GPU)
export RUN_TAG="${RUN_TAG:-}"                    # optional suffix on the run name (e.g. an experiment label)
export SAVE_INTERVAL="${SAVE_INTERVAL:-2000}"    # checkpoint every N iters (stable per-variant dir)
export SAVE_RETAIN_INTERVAL="${SAVE_RETAIN_INTERVAL:-10000}"   # forwarded to the run script unchanged
# Minutes of training after which the run script asks Megatron to save and exit.
# NOTE(review): 110 leaves only ~10 min under the 2:00:00 wall for container
# start-up, the editable TE build and the final save -- confirm this headroom,
# or lower it when SKIP_BUILD=0.
export EXIT_DURATION_MIN="${EXIT_DURATION_MIN:-110}"
export PT_SR="${PT_SR:-0}"                       # 1 = NVTE_NVFP4_PER_TOKEN_SR=1 (pertoken only)
export PT_RHT="${PT_RHT:-0}"                     # 1 = NVTE_NVFP4_PER_TOKEN_RHT=1 (pertoken only)
# Carry the in-container paths into the container env (--export=ALL below).
export TE_DIR EXAMPLE_DIR RUN_SCRIPT

echo "[sbatch] job=$SLURM_JOB_ID node=$(hostname) mode=$RUN_MODE iters=$TRAIN_ITERS seed=$SEED save_interval=$SAVE_INTERVAL retain_interval=$SAVE_RETAIN_INTERVAL exit_min=$EXIT_DURATION_MIN gpu_id=${GPU_ID:-} skip_build=$SKIP_BUILD"

# --- Run everything inside the container -----------------------------------
# The heredoc delimiter is quoted ('EOF'), so nothing below is expanded on the
# host: the in-container bash receives the text verbatim and reads every knob
# from the exported environment.
srun --container-image="$IMAGE" \
    --container-writable \
    --container-mounts="$HOST_MOUNT" \
    --container-remap-root \
    --container-workdir="/workspace" \
    --export=ALL \
    bash <<'EOF'
set -euo pipefail

# TE_DIR / EXAMPLE_DIR / RUN_SCRIPT are inherited from the host via --export=ALL.
# Abort early with a clear message if TE_DIR did not make it into the container.
: "${TE_DIR:?TE_DIR not set -- edit the host-side config block}"
EXAMPLE_DIR="${EXAMPLE_DIR:-${TE_DIR}/examples/pytorch/nvfp4_per_token_megatron}"
RUN_SCRIPT="${RUN_SCRIPT:-${EXAMPLE_DIR}/run_moe_nvfp4_singlegpu.sh}"

# 1. Per-container hygiene -----------------------------------------------------
# (a) flash-attn ABI mismatch -> remove it (TE skips the FA backend cleanly)
#     Best effort: output is discarded and a failure is ignored on purpose.
pip uninstall -y flash-attn flash_attn flash_attn_2_cuda >/dev/null 2>&1 || true
# (b) transformers needs huggingface-hub <1.0 (image often ships 1.x)
#     The probe exits 0 only when the installed version is < 1.0; any other
#     outcome (1.x installed, package missing, probe error) triggers the pinned
#     install. The probe's stderr is hidden.
python - <<'PY' 2>/dev/null || pip install -q "huggingface_hub>=0.34,<1.0"
import sys
from importlib.metadata import version
from packaging.version import parse
sys.exit(0 if parse(version("huggingface_hub")) < parse("1.0") else 1)
PY

# 2. (Re)register the editable TE build from the workspace --------------------
if [[ "${SKIP_BUILD:-0}" != "1" ]]; then
    echo "[job] (re)installing editable TE from $TE_DIR ..."
    # The working directory stays at $TE_DIR afterwards; the build log is
    # written there too. Later steps use absolute paths.
    cd "$TE_DIR"
    # NOTE(review): NVTE_CUDA_ARCHS=100a presumably restricts the build to that
    # single GPU architecture -- confirm it matches the target nodes.
    NVTE_CUDA_ARCHS=100a NVTE_BUILD_THREADS_PER_JOB=8 NVTE_FRAMEWORK=pytorch \
        pip install -e . --no-build-isolation 2>&1 | tee "build_sbatch_${SLURM_JOB_ID}.log"
else
    echo "[job] SKIP_BUILD=1 -> assuming TE already registered"
fi

# Sanity: TE must import AND expose a per-token symbol
# (an AssertionError here fails the job before any training starts).
python - <<'PY'
import transformer_engine  # libtransformer_engine.so first
import transformer_engine_torch as tex
assert hasattr(tex, "nvfp4_per_token_quantize") or hasattr(tex, "nvfp4_cutlass_per_token_gemm"), \
    "per-token TE symbols missing -> wrong/old TE is active"
print("[job] TE per-token symbols present: OK")
PY

# 3. Run the experiment(s) ---------------------------------------------------

# Root directory that holds one sub-directory per run.
WORK_ROOT="${WORK_ROOT:-${EXAMPLE_DIR}/work}"

# Map (mode, sr, rht, oned, twod, fb) -> run name (mirrors run_moe_nvfp4_singlegpu.sh
# exp names) so the wrapper's console log lands in the SAME per-run dir as wandb/tb.
#   exp_name <mode> [sr] [rht] [oned] [twod] [fb]
# oned=1 (prod/pertensor only) forces 1D weight quant (disables the default 2D)
# for the per-tensor-1d-vs-2d weight-quant ablation; named with a -1d suffix.
# twod=1 (pertoken only) adds a -2d suffix; fb=1 adds a -fb suffix.
exp_name () {
    # Usage: exp_name <mode> [sr] [rht] [oned] [twod] [fb]   -> run name on stdout
    #   mode                 pertoken | prod | pertensor; anything else is named "bf16"
    #   sr/rht/oned/twod/fb  "1" appends the matching suffix
    # An omitted/empty sr or rht falls back to PT_SR / PT_RHT for pertoken ONLY:
    # those knobs are documented as per-token-only, so they must never rename
    # (or reconfigure) a per-tensor run.
    local mode="$1" sr="${2:-}" rht="${3:-}" oned="${4:-0}" twod="${5:-0}" fb="${6:-0}" base
    case "$mode" in
        pertoken)
            base="nvfp4-pertoken"
            sr="${sr:-${PT_SR:-0}}"
            rht="${rht:-${PT_RHT:-0}}"
            ;;
        prod|pertensor)
            base="nvfp4-pertensor"
            ;;
        *)
            echo "bf16"
            return
            ;;
    esac
    if [[ "$rht" == "1" ]]; then base+="-rht"; fi
    if [[ "$sr" == "1" ]]; then base+="-sr"; fi
    # -1d only meaningful for per-tensor (per-token weights are already 1D).
    if [[ "$oned" == "1" && "$mode" != "pertoken" ]]; then base+="-1d"; fi
    # -2d only meaningful for per-token (Route A: 2D weight quant via
    # NVTE_NVFP4_PER_TOKEN_WEIGHT_2D). No-op on per-tensor (already 2D).
    if [[ "$twod" == "1" && "$mode" == "pertoken" ]]; then base+="-2d"; fi
    # -fb: keep first/last layers in bf16 (Megatron --first-last-layers-bf16).
    if [[ "$fb" == "1" ]]; then base+="-fb"; fi
    echo "$base"
}

# Emit the env-var assignments that realize (sr, rht, oned, twod) for a mode,
# as one space-separated line on stdout (empty line for bf16/unknown modes).
#   mode_env <mode> <sr> <rht> [oned] [twod]
# Flags are compared as strings against "1", exactly like exp_name does, so the
# run name and the environment can never disagree about a flag.
mode_env () {
    local mode="$1" sr="$2" rht="$3" oned="${4:-0}" twod="${5:-0}"
    local disable_rht=1 disable_sr=1 disable_2d=0
    case "$mode" in
        pertoken)
            # twod (Route A) opts the forward WEIGHT into prod 2D block scaling
            # (16x16 inner + scalar outer) while activations/gradients stay
            # per-token 1D. Safe to combine with rht/sr: those only touch
            # act/grad, the fwd weight is always no-rht/no-sr.
            echo "NVTE_NVFP4_PER_TOKEN_RHT=$rht NVTE_NVFP4_PER_TOKEN_SR=$sr NVTE_NVFP4_PER_TOKEN_WEIGHT_2D=$twod"
            ;;
        prod|pertensor)
            # disable=1 when the feature is OFF (additive: feature ON iff flag==1).
            # 2D weight quant is ON by default for prod; +1d disables it (1D weights).
            if [[ "$rht" == "1" ]]; then disable_rht=0; fi
            if [[ "$sr" == "1" ]]; then disable_sr=0; fi
            if [[ "$oned" == "1" ]]; then disable_2d=1; fi
            echo "NVTE_NVFP4_DISABLE_RHT=$disable_rht NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING=$disable_sr NVTE_NVFP4_DISABLE_2D_QUANTIZATION=$disable_2d"
            ;;
        *)
            echo ""
            ;;
    esac
}

# run_dir_for <mode> [sr] [rht] [oned] [twod] [fb] -> <WORK_ROOT>/<exp_name>[-<RUN_TAG>]-seed<SEED>
# (stable, no timestamp; seed in the dir name so different seeds never collide)
run_dir_for () {
    echo "${WORK_ROOT}/$(exp_name "$1" "${2:-}" "${3:-}" "${4:-}" "${5:-}" "${6:-}")${RUN_TAG:+-${RUN_TAG}}-seed${SEED}"
}

# parse_spec <spec> [default_sr] [default_rht]
# Split "<mode>[+rht][+sr][+1d][+2d][+fb]" on '+' and print
# "<mode> <sr> <rht> <oned> <twod> <fb>" on stdout ("pertensor" is normalized to
# "prod"). default_sr/default_rht seed sr/rht for pertoken only. Unknown modes
# AND unknown suffixes are rejected (message on stderr, return 1): a typo such
# as "+rth" would otherwise silently launch the bare configuration.
parse_spec () {
    local spec="$1" def_sr="${2:-0}" def_rht="${3:-0}"
    local mode sr=0 rht=0 oned=0 twod=0 fb=0 tok
    local -a toks=()
    IFS='+' read -ra toks <<< "$spec"
    mode="${toks[0]:-}"
    if [[ "$mode" == "pertensor" ]]; then mode="prod"; fi   # alias
    case "$mode" in
        bf16|prod) ;;
        pertoken) sr="$def_sr"; rht="$def_rht" ;;
        *)
            echo "[job] ERROR: bad mode '$mode' in spec '$spec' (want bf16|prod|pertoken[+rht][+sr][+1d][+2d][+fb])" >&2
            return 1
            ;;
    esac
    for tok in "${toks[@]:1}"; do
        case "$tok" in
            sr) sr=1 ;;
            rht) rht=1 ;;
            1d) oned=1 ;;
            2d) twod=1 ;;
            fb) fb=1 ;;
            *)
                echo "[job] ERROR: unknown suffix '+$tok' in spec '$spec' (want [+rht][+sr][+1d][+2d][+fb])" >&2
                return 1
                ;;
        esac
    done
    echo "$mode $sr $rht $oned $twod $fb"
}

# _exec_variant <gpu|""> <port> <run_name> <mode> <sr> <rht> <oned> <twod> <fb>
# Single place that builds the environment for one training run and executes
# RUN_SCRIPT with it (shared by run_one and run_bg so the two cannot drift).
# An empty <gpu> leaves CUDA_VISIBLE_DEVICES untouched.
_exec_variant () {
    local gpu="$1" port="$2" run_name="$3" mode="$4" sr="$5" rht="$6" oned="$7" twod="$8" fb="$9"
    local -a run_env=(
        TRAIN_ITERS="$TRAIN_ITERS" SEED="$SEED" MASTER_PORT="$port"
        SAVE_INTERVAL="$SAVE_INTERVAL" SAVE_RETAIN_INTERVAL="$SAVE_RETAIN_INTERVAL"
        EXIT_DURATION_MIN="$EXIT_DURATION_MIN"
        FIRST_LAST_BF16="$fb"
        WORK_ROOT="$WORK_ROOT" RUN_NAME="$run_name"
    )
    if [[ -n "$gpu" ]]; then run_env+=(CUDA_VISIBLE_DEVICES="$gpu"); fi
    # Split mode_env's space-separated NAME=VALUE list into words explicitly
    # (no unquoted command substitution, no globbing). Empty for bf16.
    local -a quant_env=()
    read -ra quant_env <<< "$(mode_env "$mode" "$sr" "$rht" "$oned" "$twod")"
    # Guarded append: expanding an empty array trips `set -u` on bash < 4.4.
    if (( ${#quant_env[@]} > 0 )); then run_env+=("${quant_env[@]}"); fi
    env "${run_env[@]}" bash "$RUN_SCRIPT" "$mode"
}

# Foreground single run (optionally pinned to GPU_ID). Console log -> work dir.
#   run_one <spec>
# The spec uses the same syntax as the comma list (e.g. "pertoken+rht",
# "pertensor+1d", "pertoken+2d+fb"). Without +sr/+rht, a pertoken run falls
# back to PT_SR/PT_RHT; other modes never read them. Exits 1 on a bad spec.
run_one () {
    local spec="$1" parsed mode sr rht oned twod fb
    parsed="$(parse_spec "$spec" "${PT_SR:-0}" "${PT_RHT:-0}")" || exit 1
    read -r mode sr rht oned twod fb <<< "$parsed"
    local run_name
    run_name="$(exp_name "$mode" "$sr" "$rht" "$oned" "$twod" "$fb")${RUN_TAG:+-${RUN_TAG}}-seed${SEED}"
    local run_dir="${WORK_ROOT}/${run_name}"
    mkdir -p "$run_dir"
    echo "============================================================"
    echo "[job] === running mode=$mode sr=$sr rht=$rht oned=$oned twod=$twod fb=$fb (iters=$TRAIN_ITERS, GPU=${GPU_ID:-}) -> $run_dir ==="
    echo "============================================================"
    _exec_variant "${GPU_ID:-}" "$BASE_PORT" "$run_name" "$mode" "$sr" "$rht" "$oned" "$twod" "$fb" 2>&1 \
        | tee "${run_dir}/console.log"
}

# Background run pinned to a specific GPU + port; everything in its own work dir.
# Per-run SR/RHT (args 4/5) override the global PT_SR/PT_RHT so one job can run
# several pertoken variants side by side; when omitted they fall back to
# PT_SR/PT_RHT for pertoken and to 0 for every other mode.
#   run_bg <mode> <gpu> <port> [sr] [rht] [oned] [twod] [fb]
# Returns immediately; the caller collects the job with `wait`.
run_bg () {
    local mode="$1" gpu="$2" port="$3" sr="${4:-}" rht="${5:-}" oned="${6:-0}" twod="${7:-0}" fb="${8:-0}"
    if [[ "$mode" == "pertoken" ]]; then
        sr="${sr:-${PT_SR:-0}}"
        rht="${rht:-${PT_RHT:-0}}"
    else
        sr="${sr:-0}"
        rht="${rht:-0}"
    fi
    local run_name
    run_name="$(exp_name "$mode" "$sr" "$rht" "$oned" "$twod" "$fb")${RUN_TAG:+-${RUN_TAG}}-seed${SEED}"
    local run_dir="${WORK_ROOT}/${run_name}"
    mkdir -p "$run_dir"
    echo "[job] launch mode=$mode sr=$sr rht=$rht oned=$oned twod=$twod fb=$fb on GPU $gpu (port $port) -> $run_dir/console.log"
    _exec_variant "$gpu" "$port" "$run_name" "$mode" "$sr" "$rht" "$oned" "$twod" "$fb" \
        >"${run_dir}/console.log" 2>&1 &
}

N_GPUS="${N_GPUS:-4}"   # GPUs available on the node (QOS gives a full 4-GPU node)

# Launch a list of run specs concurrently, one GPU each.
# Each spec: <mode>[+rht][+sr][+1d][+2d][+fb]  e.g.
#   bf16, pertensor+rht+sr, pertoken, pertoken+sr
#
# launch_list <spec>...
#   Runs every spec in the background via run_bg: spec i gets GPU i and port
#   BASE_PORT+i. Waits for all of them, prints the per-run directories and
#   EXITS the shell with 0 if every run succeeded, 1 otherwise.
#   All specs are parsed and validated BEFORE the first run is launched, so a
#   typo in a later spec can no longer abort the job while earlier runs are
#   already training in the background. Unknown suffixes are rejected instead
#   of being silently ignored.
launch_list () {
    local specs=("$@")
    local n=${#specs[@]}
    if (( n == 0 )); then
        echo "[job] ERROR: empty run list" >&2
        exit 1
    fi
    if (( n > N_GPUS )); then
        echo "[job] ERROR: requested $n runs but only $N_GPUS GPUs available" >&2
        echo "[job] specs: ${specs[*]}" >&2
        exit 1
    fi

    # Pass 1: parse + validate. plan[i] = "<mode> <sr> <rht> <oned> <twod> <fb>".
    local -a plan=() toks=()
    local spec mode sr rht oned twod fb tok
    for spec in "${specs[@]}"; do
        IFS='+' read -ra toks <<< "$spec"
        mode="${toks[0]:-}"; sr=0; rht=0; oned=0; twod=0; fb=0
        if [[ "$mode" == "pertensor" ]]; then mode="prod"; fi   # alias
        case "$mode" in
            bf16|prod|pertoken) ;;
            *)
                echo "[job] ERROR: bad mode '$mode' in spec '$spec' (want bf16|prod|pertoken[+rht][+sr][+1d][+2d][+fb])" >&2
                exit 1
                ;;
        esac
        for tok in "${toks[@]:1}"; do
            case "$tok" in
                sr) sr=1 ;;
                rht) rht=1 ;;
                1d) oned=1 ;;
                2d) twod=1 ;;
                fb) fb=1 ;;
                *)
                    echo "[job] ERROR: unknown suffix '+$tok' in spec '$spec' (want [+rht][+sr][+1d][+2d][+fb])" >&2
                    exit 1
                    ;;
            esac
        done
        plan+=("$mode $sr $rht $oned $twod $fb")
    done

    # Pass 2: launch one background run per spec and remember its PID.
    local i=0 entry
    local -a pids=()
    for entry in "${plan[@]}"; do
        read -r mode sr rht oned twod fb <<< "$entry"
        run_bg "$mode" "$i" "$((BASE_PORT + i))" "$sr" "$rht" "$oned" "$twod" "$fb"
        pids+=("$!")   # run_bg ends with a backgrounded command, so $! is that run
        i=$((i + 1))
    done

    # Wait for every run; keep going after a failure so all runs are reaped.
    local rc=0
    for i in "${!pids[@]}"; do
        if ! wait "${pids[$i]}"; then
            echo "[job] run '${specs[$i]}' failed (see its console.log)" >&2
            rc=1
        fi
    done

    echo "[job] runs finished (rc=$rc). Per-run dirs (console.log + wandb/ + tb/):"
    for entry in "${plan[@]}"; do
        read -r mode sr rht oned twod fb <<< "$entry"
        echo " $(run_dir_for "$mode" "$sr" "$rht" "$oned" "$twod" "$fb")"
    done
    exit "$rc"
}

# Tolerate whitespace in the request (e.g. "bf16, pertoken"): no mode or suffix
# contains blanks, so stripping them is lossless.
RUN_MODE="${RUN_MODE//[[:space:]]/}"

# Aliases expand to a spec list; a comma-separated RUN_MODE is taken verbatim.
case "$RUN_MODE" in
    parallel)  RUN_MODE="bf16,pertensor+rht+sr,pertoken" ;;
    parallel4) RUN_MODE="bf16,pertensor+rht+sr,pertoken,pertoken+sr" ;;
esac

if [[ "$RUN_MODE" == *","* ]]; then
    # Concurrent list: launch_list exits the shell with the aggregate status.
    IFS=',' read -ra specs <<< "$RUN_MODE"
    echo "[job] concurrent runs (GPU 0..$((${#specs[@]} - 1))): ${specs[*]}"
    launch_list "${specs[@]}"
elif [[ "$RUN_MODE" == "all" ]]; then
    # Sequential sweep over the three base modes on a single GPU.
    for m in bf16 prod pertoken; do run_one "$m"; done
else
    run_one "$RUN_MODE"
fi

echo "[job] done."
+EOF + +echo "[sbatch] job $SLURM_JOB_ID finished." diff --git a/examples/pytorch/nvfp4_per_token_megatron/submit_chain.sh b/examples/pytorch/nvfp4_per_token_megatron/submit_chain.sh new file mode 100755 index 0000000000..fcad8c845f --- /dev/null +++ b/examples/pytorch/nvfp4_per_token_megatron/submit_chain.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +# Copyright (c) 2022-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See LICENSE for license information. + +# ============================================================================ +# Usage (run on the LOGIN node, not inside a container): +# CHAIN=3 bash submit_chain.sh \ +# --export=ALL,IMAGE=/path/to/te_pertoken.sqsh,SKIP_BUILD=1,TRAIN_ITERS=60000 \ +# sbatch_moe_nvfp4_singlegpu.sh pertoken +# ============================================================================ +set -euo pipefail + +CHAIN="${CHAIN:-2}" +DEP_TYPE="${DEP_TYPE:-afterany}" + +if [[ $# -lt 1 ]]; then + echo "usage: CHAIN= bash submit_chain.sh