
Commit 9acbdab

Add HybridEP support for GB200 NVL72
- Add deep_ep aarch64 dependency (7febc6e2, hybrid-ep branch)
- Add HybridEP setup in megatron setup.py (IMEX env vars, NVLink domain config)
- Add Qwen3-30B-A3B 4n4g config with moe_flex_dispatcher_backend=hybridep
- Add EP=4, EP=8, sms16 config variants for ablation testing
- Add test scripts for EP variants
- Update Dockerfile for aarch64 deep_ep build support
- Add HybridEP settings to 235B and DeepSeek-V3 performance configs

Signed-off-by: sna <sna@nvidia.com>
1 parent cf58b93 commit 9acbdab

16 files changed: 566 additions & 99 deletions

docker/Dockerfile

Lines changed: 4 additions & 0 deletions
@@ -112,6 +112,10 @@ ENV UV_LINK_MODE=copy
 # Ensure DeepEP is built for H100 and B200 (also mcore inference unified memory API now invokes a torch API that requires these to be set)
 ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"
 
+# Ensure HybridEP/DeepEP JIT compilation can find nvcc at runtime
+ENV CUDA_HOME=/usr/local/cuda
+
+
 # First copy only the dependency files
 COPY --from=nemo-rl pyproject.toml uv.lock ./
 # Copy in the top level __init__.py/package_info.py since build-custom-vllm.sh needs the nemo_rl package to exist.
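
The added ENV line matters because HybridEP/DeepEP compiles its communication kernels just-in-time inside the running container rather than at image build time, so nvcc must be discoverable when training starts. Below is a minimal illustrative sketch, not DeepEP's actual lookup code, of how a JIT build step typically resolves the compiler from CUDA_HOME, which is exactly what the added variable pins down:

# Illustrative sketch only -- not DeepEP's real nvcc discovery logic.
import os
import shutil

def find_nvcc() -> str:
    # Prefer an explicit CUDA_HOME (as set in the Dockerfile), then fall back to PATH.
    cuda_home = os.environ.get("CUDA_HOME", "/usr/local/cuda")
    candidate = os.path.join(cuda_home, "bin", "nvcc")
    if os.path.isfile(candidate):
        return candidate
    on_path = shutil.which("nvcc")
    if on_path is None:
        raise RuntimeError("nvcc not found; set CUDA_HOME to your CUDA installation")
    return on_path

if __name__ == "__main__":
    print(find_nvcc())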

docker/Dockerfile.ngc_pytorch

Lines changed: 4 additions & 0 deletions
@@ -90,6 +90,10 @@ ENV PATH="/opt/nemo_rl_venv/bin:$PATH"
 # Ensure DeepEP is built for H100 and B200
 ENV TORCH_CUDA_ARCH_LIST="9.0 10.0"
 
+# Ensure HybridEP/DeepEP JIT compilation can find nvcc at runtime
+ENV CUDA_HOME=/usr/local/cuda
+
+
 # First copy only the dependency files
 COPY --from=nemo-rl pyproject.toml uv.lock ./
 COPY --from=nemo-rl --link 3rdparty/ ./3rdparty/

examples/configs/recipes/llm/performance/grpo-deepseek-v3-32n4g.yaml

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ policy:
     expert_model_parallel_size: 16
     num_layers_in_first_pipeline_stage: 7
     num_layers_in_last_pipeline_stage: 6
+    # HybridEP settings
+    moe_token_dispatcher_type: flex
+    moe_flex_dispatcher_backend: hybridep
+    moe_hybridep_num_sms: 32
   generation:
     vllm_cfg:
       tensor_parallel_size: 32

examples/configs/recipes/llm/performance/grpo-qwen3-235b-16n4g.yaml

Lines changed: 4 additions & 0 deletions
@@ -6,6 +6,10 @@ policy:
     pipeline_model_parallel_size: 4
     num_layers_in_first_pipeline_stage: 23
     num_layers_in_last_pipeline_stage: 23
+    # HybridEP settings
+    moe_token_dispatcher_type: flex
+    moe_flex_dispatcher_backend: hybridep
+    moe_hybridep_num_sms: 32
   generation:
     vllm_cfg:
       tensor_parallel_size: 8

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+defaults: ./grpo-qwen3-30ba3b-4n4g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-30ba3b-4n4g-ep4
+policy:
+  megatron_cfg:
+    expert_model_parallel_size: 4
+logger:
+  log_dir: logs/grpo-qwen3-30ba3b-4n4g-ep4
+  wandb:
+    name: grpo-qwen3-30ba3b-4n4g-ep4

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+defaults: ./grpo-qwen3-30ba3b-4n4g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-30ba3b-4n4g-ep8
+policy:
+  megatron_cfg:
+    expert_model_parallel_size: 8
+logger:
+  log_dir: logs/grpo-qwen3-30ba3b-4n4g-ep8
+  wandb:
+    name: grpo-qwen3-30ba3b-4n4g-ep8

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+defaults: ./grpo-qwen3-30ba3b-4n4g.yaml
+checkpointing:
+  checkpoint_dir: results/grpo-qwen3-30ba3b-4n4g-sms16
+policy:
+  megatron_cfg:
+    moe_hybridep_num_sms: 16
+logger:
+  log_dir: logs/grpo-qwen3-30ba3b-4n4g-sms16
+  wandb:
+    name: grpo-qwen3-30ba3b-4n4g-sms16

examples/configs/recipes/llm/performance/grpo-qwen3-30ba3b-4n4g.yaml

Lines changed: 4 additions & 1 deletion
@@ -21,6 +21,10 @@ policy:
     pipeline_model_parallel_size: 1
     expert_model_parallel_size: 16
     sequence_parallel: false
+    # HybridEP settings
+    moe_token_dispatcher_type: flex
+    moe_flex_dispatcher_backend: hybridep
+    moe_hybridep_num_sms: 32
     optimizer:
       lr: 3.0e-07
       min_lr: 3.0e-08
@@ -42,4 +46,3 @@ logger:
 cluster:
   gpus_per_node: 4
   num_nodes: 4
-
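
The three new -ep4/-ep8/-sms16 files above are thin overrides of this base recipe: each points back at ./grpo-qwen3-30ba3b-4n4g.yaml via defaults: and changes only the key under study (expert_model_parallel_size or moe_hybridep_num_sms) plus the checkpoint, log, and wandb names. The sketch below shows the intended composition as a plain recursive dictionary merge; the actual defaults: resolution is handled by nemo-rl's config loader, so treat deep_merge as an assumed stand-in rather than the library's API:

# Assumed composition semantics of a `defaults:` override (illustration only).
def deep_merge(base: dict, override: dict) -> dict:
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

base = {"policy": {"megatron_cfg": {"expert_model_parallel_size": 16,
                                    "moe_hybridep_num_sms": 32}}}
ep4_override = {"policy": {"megatron_cfg": {"expert_model_parallel_size": 4}}}
print(deep_merge(base, ep4_override))
# {'policy': {'megatron_cfg': {'expert_model_parallel_size': 4, 'moe_hybridep_num_sms': 32}}}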

nemo_rl/models/megatron/setup.py

Lines changed: 43 additions & 0 deletions
@@ -477,6 +477,49 @@ def _apply_moe_config(model_cfg: Any, config: PolicyConfig) -> None:
         "moe_shared_expert_overlap"
     ]
 
+    # HybridEP settings for MoE expert parallelism
+    # See: https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep
+    if "moe_flex_dispatcher_backend" in config["megatron_cfg"]:
+        model_cfg.moe_flex_dispatcher_backend = config["megatron_cfg"][
+            "moe_flex_dispatcher_backend"
+        ]
+    if "moe_hybridep_num_sms" in config["megatron_cfg"]:
+        model_cfg.moe_hybridep_num_sms = config["megatron_cfg"]["moe_hybridep_num_sms"]
+
+    # HybridEP environment variables
+    # These are required by DeepEP's hybrid-ep branch for NVLink domain configuration.
+    # Users can set them explicitly via config, or they will be auto-computed with a warning.
+    if config["megatron_cfg"].get("moe_flex_dispatcher_backend") == "hybridep":
+        ep_size = model_cfg.expert_model_parallel_size
+
+        # NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN
+        if "hybridep_num_ranks_per_nvlink_domain" in config["megatron_cfg"]:
+            val = config["megatron_cfg"]["hybridep_num_ranks_per_nvlink_domain"]
+            os.environ["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = str(val)
+        elif "NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN" not in os.environ:
+            default_val = min(ep_size, 64)
+            os.environ["NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN"] = str(default_val)
+            warnings.warn(
+                f"HybridEP: NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN not configured. "
+                f"Auto-setting to min(expert_model_parallel_size={ep_size}, 64) = {default_val}. "
+                f"Set 'hybridep_num_ranks_per_nvlink_domain' in megatron_cfg to override.",
+                stacklevel=2,
+            )
+
+        # USE_MNNVL
+        if "hybridep_use_mnnvl" in config["megatron_cfg"]:
+            val = config["megatron_cfg"]["hybridep_use_mnnvl"]
+            os.environ["USE_MNNVL"] = str(int(val))
+        elif "USE_MNNVL" not in os.environ:
+            default_val = int(ep_size > 4)
+            os.environ["USE_MNNVL"] = str(default_val)
+            warnings.warn(
+                f"HybridEP: USE_MNNVL not configured. "
+                f"Auto-setting to int(expert_model_parallel_size={ep_size} > 4) = {default_val}. "
+                f"Set 'hybridep_use_mnnvl' in megatron_cfg to override.",
+                stacklevel=2,
+            )
+
     model_cfg.moe_permute_fusion = config["megatron_cfg"]["moe_permute_fusion"]
 
 
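The environment-variable block above only fires when moe_flex_dispatcher_backend is hybridep, and it never overrides values already provided through the config or the environment. The standalone snippet below simply reproduces the defaulting arithmetic from the diff so the behaviour for a few expert-parallel sizes is easy to read off; it is an illustration, not code from the repository:

# Reproduces the auto-default rules from _apply_moe_config above (illustration only).
def hybridep_env_defaults(ep_size: int) -> dict:
    return {
        "NUM_OF_HYBRID_EP_RANKS_PER_NVLINK_DOMAIN": str(min(ep_size, 64)),
        "USE_MNNVL": str(int(ep_size > 4)),
    }

for ep in (4, 8, 16, 72):
    print(ep, hybridep_env_defaults(ep))
# ep=4  -> ranks_per_domain='4',  USE_MNNVL='0'  (MNNVL stays off at EP <= 4)
# ep=8  -> ranks_per_domain='8',  USE_MNNVL='1'
# ep=16 -> ranks_per_domain='16', USE_MNNVL='1'
# ep=72 -> ranks_per_domain='64', USE_MNNVL='1'  (domain size is capped at 64)
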
nemo_rl/models/policy/__init__.py

Lines changed: 8 additions & 0 deletions
@@ -236,6 +236,14 @@ class MegatronConfig(TypedDict):
     moe_token_dispatcher_type: str
     # Can be used only with 'alltoall' token dispatcher
     moe_shared_expert_overlap: bool
+    # HybridEP settings for MoE expert parallelism (requires moe_token_dispatcher_type='flex')
+    # See: https://github.com/deepseek-ai/DeepEP/tree/hybrid-ep
+    moe_flex_dispatcher_backend: NotRequired[str]
+    moe_hybridep_num_sms: NotRequired[int]
+    # Number of HybridEP ranks per NVLink domain (default: min(expert_model_parallel_size, 64))
+    hybridep_num_ranks_per_nvlink_domain: NotRequired[int]
+    # Enable multi-node NVLink support (default: expert_model_parallel_size > 4)
+    hybridep_use_mnnvl: NotRequired[bool]
     peft: NotRequired[MegatronPeftConfig | MegatronPeftConfigDisabled]
     optimizer: MegatronOptimizerConfig
     scheduler: MegatronSchedulerConfig
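
Taken together with the recipe changes above, a megatron_cfg fragment that opts into HybridEP can now be checked against this TypedDict. The dictionary below mirrors the values used in the recipes; the last two keys are optional and shown only to illustrate the new override hooks (their values here are arbitrary examples, not recommendations):

# Example megatron_cfg fragment using the new keys (values mirror the recipe diffs;
# the last two entries are optional overrides for the auto-computed env defaults).
megatron_cfg = {
    "moe_token_dispatcher_type": "flex",
    "moe_flex_dispatcher_backend": "hybridep",
    "moe_hybridep_num_sms": 32,
    "hybridep_num_ranks_per_nvlink_domain": 16,  # optional; default is min(EP size, 64)
    "hybridep_use_mnnvl": True,                  # optional; default is EP size > 4
}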
