Skip to content

Commit 5fbb559

Browse files
huvunvidia (Huy Vu2) and co-author authored
Enable nemo-ci tests (short runs - perf and non-perf) for Wan + Updating recipes names (#3179)
Co-authored-by: Huy Vu2 <huvu@login-eos02.eos.clusters.nvidia.com>
1 parent ad27e2c commit 5fbb559

19 files changed

Lines changed: 319 additions & 211 deletions

File tree

examples/diffusion/recipes/wan/README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -146,15 +146,15 @@ WAN uses different flow-matching hyperparameters for pretraining vs fine-tuning.
146146

147147
```bash
148148
uv run torchrun --nproc_per_node=8 scripts/training/run_recipe.py \
149-
--recipe wan_1_3B_pretrain_config \
149+
--recipe wan_1_3b_pretrain_config \
150150
--step_func wan_step
151151
```
152152

153153
### WAN 1.3B — Real data (WebDataset path):
154154

155155
```bash
156156
uv run torchrun --nproc_per_node=8 scripts/training/run_recipe.py \
157-
--recipe wan_1_3B_pretrain_config \
157+
--recipe wan_1_3b_pretrain_config \
158158
--step_func wan_step \
159159
dataset.path=${WORKSPACE}/datasets/wan
160160
```
@@ -163,7 +163,7 @@ uv run torchrun --nproc_per_node=8 scripts/training/run_recipe.py \
163163

164164
```bash
165165
uv run torchrun --nproc_per_node=$NUM_GPUS scripts/training/run_recipe.py \
166-
--recipe wan_1_3B_pretrain_config \
166+
--recipe wan_1_3b_pretrain_config \
167167
--step_func wan_step \
168168
dataset.path=${WORKSPACE}/datasets/wan \
169169
train.global_batch_size=8 \

examples/diffusion/recipes/wan/conf/gb200_perf_pretrain_mock.yaml

Lines changed: 0 additions & 33 deletions
This file was deleted.

examples/diffusion/recipes/wan/conf/gb300_perf_pretrain_mock.yaml

Lines changed: 0 additions & 33 deletions
This file was deleted.

examples/diffusion/recipes/wan/conf/h100_perf_pretrain_mock.yaml

Lines changed: 0 additions & 37 deletions
This file was deleted.

examples/diffusion/recipes/wan/prepare_dataset/openvid1M_dataset/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ CHECKPOINT_DIR=<path/to/save/checkpoints>
7373
EXP_NAME=<experiment_name>
7474

7575
NVTE_FUSED_ATTN=1 torchrun --nproc_per_node=8 scripts/training/run_recipe.py \
76-
--recipe wan_1_3B_pretrain_config \
76+
--recipe wan_1_3b_pretrain_config \
7777
--step_func wan_step \
7878
model.tensor_model_parallel_size=1 \
7979
model.pipeline_model_parallel_size=1 \

scripts/performance/argument_parser.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,7 +170,7 @@ def parse_cli_args():
170170
parser.add_argument(
171171
"--domain",
172172
type=lower_str,
173-
choices=["llm", "vlm", "qwen3vl"],
173+
choices=["llm", "vlm", "qwen3vl", "diffusion"],
174174
help="Domain to use for experiment.",
175175
default="llm",
176176
)
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Gate every export on megatron.bridge being importable: both submodule
# imports below live under the HAVE_MEGATRON_BRIDGE guard, so the names
# they provide must only be advertised in ``__all__`` under the same
# guard.  (The original listed the WAN_14B_PRETRAIN_CONFIG_* constants in
# ``__all__`` unconditionally, which made ``from package import *`` raise
# AttributeError whenever megatron.bridge was not installed.)
try:
    import megatron.bridge  # noqa: F401

    HAVE_MEGATRON_BRIDGE = True
except ModuleNotFoundError:
    # megatron.bridge is an optional dependency; degrade to an empty API.
    HAVE_MEGATRON_BRIDGE = False

__all__: list[str] = []

if HAVE_MEGATRON_BRIDGE:
    from .wan_diffusion_pretrain import (
        wan_14b_pretrain_config_gb200,
        wan_14b_pretrain_config_h100,
    )
    from .wan_workload_base_configs import (
        WAN_14B_PRETRAIN_CONFIG_GB200_BF16_V1,
        WAN_14B_PRETRAIN_CONFIG_H100_BF16_V1,
    )

    # Same final contents/order as before when the bridge is present:
    # preset constants first, recipe builder functions second.
    __all__.extend(
        [
            "WAN_14B_PRETRAIN_CONFIG_GB200_BF16_V1",
            "WAN_14B_PRETRAIN_CONFIG_H100_BF16_V1",
            "wan_14b_pretrain_config_gb200",
            "wan_14b_pretrain_config_h100",
        ]
    )
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import logging
16+
17+
from utils.overrides import set_workload_base_configs
18+
from utils.utils import get_workload_base_config
19+
20+
from megatron.bridge.diffusion.recipes.wan.wan import wan_14b_pretrain_config
21+
from megatron.bridge.training.config import ConfigContainer
22+
23+
24+
logger = logging.getLogger(__name__)
25+
26+
27+
# Wan 14B pretrain configs ---------------------------------------------------
28+
29+
30+
def wan_14b_pretrain_config_gb200(
    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
) -> ConfigContainer:
    """Build the GB200 Wan 14B pretrain recipe (TP=1, CP=4, GBS=64).

    Args:
        precision: Compute dtype name; upper-cased when selecting the
            workload base config.
        mock: Present for signature parity with other recipe builders;
            not referenced in this function body.
        config_variant: Workload base-config variant tag (e.g. "v1").

    Returns:
        The Wan 14B pretrain ``ConfigContainer`` with the GB200 workload
        base settings applied.
    """
    # Fetch the per-GPU performance preset for this model/task combination.
    workload_overrides = get_workload_base_config(
        model_family_name="wan",
        model_recipe_name="wan_14b",
        gpu="gb200",
        compute_dtype=precision.upper(),
        task="pretrain",
        config_variant=config_variant,
    )
    # Start from the stock recipe, then overlay the preset values.
    recipe = wan_14b_pretrain_config()
    set_workload_base_configs(recipe, workload_overrides)
    return recipe
45+
46+
47+
def wan_14b_pretrain_config_h100(
    precision: str = "bf16", mock: bool = True, config_variant: str = "v1"
) -> ConfigContainer:
    """H100, Wan 14B pretrain: TP=2, CP=4, activation recompute (8 layers).

    NOTE(review): the original summary said "GBS=128", but the companion
    WAN_14B_PRETRAIN_CONFIG_H100_BF16_V1 preset does not override the base
    global_batch_size of 64 — confirm which batch size is intended.

    Args:
        precision: Compute dtype name; upper-cased when looking up the
            workload base config.
        mock: Accepted for signature parity with other recipe builders;
            not referenced in this function body.
        config_variant: Workload base-config variant tag (e.g. "v1").

    Returns:
        ConfigContainer: the Wan 14B pretrain recipe after the H100
        workload base settings have been applied to it.
    """
    # Look up the H100 performance preset for this model/task combination.
    base_cfg = get_workload_base_config(
        model_family_name="wan",
        model_recipe_name="wan_14b",
        gpu="h100",
        compute_dtype=precision.upper(),
        task="pretrain",
        config_variant=config_variant,
    )
    # Build the stock recipe, then overlay the preset onto it in place.
    cfg = wan_14b_pretrain_config()
    set_workload_base_configs(cfg, base_cfg)
    return cfg
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""Parallelism presets for Wan 14B performance configs.
16+
17+
Config naming convention:
18+
{MODEL}_{SIZE}_{TASK}_CONFIG_{GPU}_{PRECISION}_{VERSION}
19+
20+
All configs use bf16 precision (diffusion training does not use fp8).
21+
Parallelism settings are sourced from the per-GPU YAML perf configs in
22+
examples/diffusion/recipes/wan/conf/.
23+
"""
24+
25+
from dataclasses import replace
26+
27+
from utils.utils import WorkloadBaseConfig
28+
29+
30+
# Shared defaults for every Wan 14B preset; GPU-specific presets below
# override only the fields that differ via dataclasses.replace, so the
# batch sizes here (GBS=64, MBS=1) carry through unless overridden.
BASE_WAN_14B_CONFIG = WorkloadBaseConfig(
    num_gpus=8,  # placeholder scale; each preset sets its real GPU count
    global_batch_size=64,
    micro_batch_size=1,
)

# =============================================================================
# Wan 14B pretrain presets
# =============================================================================

# GB200: 16 GPUs (4 nodes), TP=1, CP=4 — implied DP=4 at GBS=64 (inherited
# from BASE_WAN_14B_CONFIG).
WAN_14B_PRETRAIN_CONFIG_GB200_BF16_V1 = replace(
    BASE_WAN_14B_CONFIG,
    num_gpus=16,
    tensor_model_parallel_size=1,
    context_parallel_size=4,
)

# H100: 32 GPUs (4 nodes), TP=2, CP=4 — implied DP=4 at GBS=64 (inherited
# from BASE_WAN_14B_CONFIG); recompute_num_layers=8 enables activation
# recompute to fit the 14B model in H100 memory.
WAN_14B_PRETRAIN_CONFIG_H100_BF16_V1 = replace(
    BASE_WAN_14B_CONFIG,
    num_gpus=32,
    tensor_model_parallel_size=2,
    context_parallel_size=4,
    recompute_num_layers=8,
)

0 commit comments

Comments
 (0)