Commit 295ed82

feat: FSDP2 with weight prefetching and async TP optimization

Signed-off-by: Zhiyu Li <zhiyul@NVIDIA.com>
1 parent b9a2154

35 files changed: 1,237 additions and 1,884 deletions

examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark.yaml

Lines changed: 8 additions & 14 deletions

@@ -8,7 +8,6 @@ recipe: TrainFinetuneRecipeForNextTokenPrediction
 
 seed: 42
 
-# NEW: Add benchmark section
 benchmark:
   warmup_steps: 5
   peak_tflops: 989 # H100: 989, A100: 312
@@ -19,7 +18,7 @@ benchmark:
 
 step_scheduler:
   global_batch_size: 32
-  local_batch_size: 4
+  local_batch_size: 2
   ckpt_every_steps: 50
   val_every_steps: 1000
   max_steps: 10
@@ -53,21 +52,17 @@ checkpoint:
 
 distributed:
   strategy: fsdp2
-  dp_size: none
+  dp_size: null
   tp_size: 2
   cp_size: 1
-  pp_size: 4
 
-  sequence_parallel: false
+  sequence_parallel: true
   activation_checkpointing: true
-
-  pipeline:
-    pp_schedule: interleaved1f1b
-    pp_microbatch_size: 1
-    layers_per_stage: 2
-    scale_grads_in_schedule: false
-    round_virtual_stages_to_pp_multiple: up
-    dtype: bf16
+  enable_async_tensor_parallel: true
+  enable_fsdp2_prefetch: true
+  enable_compile: true
+  defer_fsdp_grad_sync: false
+  patch_is_packed_sequence: true # Patch transformers._is_packed_sequence to always return False: removes CPU-GPU sync per attention layer and ensures static shapes for torch.compile. Safe for non-packed (standard) training only.
 
 loss_fn:
   _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
@@ -82,7 +77,6 @@ dataset:
 dataloader:
   _target_: torch.utils.data.DataLoader
   batch_size: null # Dataset already yields batches
-  # Note: model_config will be auto-injected by train_ft.py for PP models
 
 optimizer:
   _target_: torch.optim.Adam
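
The patch_is_packed_sequence comment above describes monkeypatching transformers' packed-sequence probe so it never touches device data. A minimal sketch of the idea, assuming the helper lives in transformers.masking_utils (the module path and patch site are assumptions, not this repo's code):

# Hypothetical illustration of the patch_is_packed_sequence idea.
import transformers.masking_utils as masking_utils

def _always_unpacked(*args, **kwargs) -> bool:
    # The stock check inspects position_ids on the GPU, forcing a device->host
    # sync in every attention layer and producing data-dependent control flow
    # that breaks torch.compile's static-shape assumptions. For standard
    # (non-packed) batches it always evaluates to False, so short-circuit it.
    return False

masking_utils._is_packed_sequence = _always_unpacked  # only safe for non-packed data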

examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark_2nodes.yaml

Lines changed: 10 additions & 14 deletions

@@ -19,14 +19,14 @@ benchmark:
 
 step_scheduler:
   global_batch_size: 32
-  local_batch_size: 8
+  local_batch_size: 2
   ckpt_every_steps: 50
   val_every_steps: 1000
   max_steps: 10
 
 dist_env:
   backend: nccl
-  timeout_minutes: 1
+  timeout_minutes: 10
 
 rng:
   _target_: nemo_automodel.components.training.rng.StatefulRNG
@@ -53,21 +53,18 @@ checkpoint:
 
 distributed:
   strategy: fsdp2
-  dp_size: 2
+  dp_size: null
   tp_size: 2
   cp_size: 1
-  pp_size: 4
+  pp_size: 1
 
-  sequence_parallel: false
+  sequence_parallel: true
   activation_checkpointing: true
-
-  pipeline:
-    pp_schedule: interleaved1f1b
-    pp_microbatch_size: 1
-    layers_per_stage: 2
-    scale_grads_in_schedule: false
-    round_virtual_stages_to_pp_multiple: up
-    dtype: bf16
+  enable_async_tensor_parallel: true
+  enable_fsdp2_prefetch: true
+  enable_compile: true
+  defer_fsdp_grad_sync: false
+  patch_is_packed_sequence: true # Patch transformers._is_packed_sequence to always return False: removes CPU-GPU sync per attention layer and ensures static shapes for torch.compile. Safe for non-packed (standard) training only.
 
 loss_fn:
   _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
@@ -82,7 +79,6 @@ dataset:
 dataloader:
   _target_: torch.utils.data.DataLoader
   batch_size: null # Dataset already yields batches
-  # Note: model_config will be auto-injected by train_ft.py for PP models
 
 optimizer:
   _target_: torch.optim.Adam
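
With dp_size: null, the data-parallel degree is derived from the world size and the remaining parallel dimensions. A minimal sketch of that arithmetic (helper name is illustrative, not the recipe's code): for this 2-node config, 16 ranks with tp=2, cp=1, pp=1 gives dp=8.

# Illustrative only: how dp_size falls out when it is left as null in the config.
def derive_dp_size(world_size: int, tp_size: int, cp_size: int, pp_size: int) -> int:
    denom = tp_size * cp_size * pp_size
    assert world_size % denom == 0, "world size must be divisible by tp*cp*pp"
    return world_size // denom

assert derive_dp_size(world_size=16, tp_size=2, cp_size=1, pp_size=1) == 8  # 2 nodes x 8 GPUs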

examples/llm_finetune/qwen/custom_qwen2_5_32b_peft_benchmark.yaml

Lines changed: 9 additions & 14 deletions

@@ -19,14 +19,14 @@ benchmark:
 
 step_scheduler:
   global_batch_size: 32
-  local_batch_size: 8
+  local_batch_size: 2
   ckpt_every_steps: 50
   val_every_steps: 1000
   max_steps: 10
 
 dist_env:
   backend: nccl
-  timeout_minutes: 1
+  timeout_minutes: 10
 
 rng:
   _target_: nemo_automodel.components.training.rng.StatefulRNG
@@ -53,21 +53,17 @@ checkpoint:
 
 distributed:
   strategy: fsdp2
-  dp_size: none
-  tp_size: 1
+  dp_size: null
+  tp_size: 2
   cp_size: 1
-  pp_size: 4
+  pp_size: 1
 
   sequence_parallel: false
   activation_checkpointing: true
-
-  pipeline:
-    pp_schedule: interleaved1f1b
-    pp_microbatch_size: 1
-    layers_per_stage: 2
-    scale_grads_in_schedule: false
-    round_virtual_stages_to_pp_multiple: up
-    dtype: bf16
+  enable_async_tensor_parallel: false
+  enable_fsdp2_prefetch: true
+  enable_compile: true
+  patch_is_packed_sequence: true # Patch transformers._is_packed_sequence to always return False: removes CPU-GPU sync per attention layer and ensures static shapes for torch.compile. Safe for non-packed (standard) training only.
 
 loss_fn:
   _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
@@ -82,7 +78,6 @@ dataset:
 dataloader:
   _target_: torch.utils.data.DataLoader
   batch_size: null # Dataset already yields batches
-  # Note: model_config will be auto-injected by train_ft.py for PP models
 
 optimizer:
   _target_: torch.optim.Adam
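
enable_fsdp2_prefetch corresponds to the commit's explicit weight-prefetching path. A hedged sketch of how explicit FSDP2 prefetching is typically wired with plain PyTorch APIs; the model.model.layers attribute path and a prefetch depth of one block are assumptions, and the recipe's own implementation may differ. FSDP2 already prefetches the next all-gather implicitly, so the explicit form mainly matters when prefetching further ahead or across non-sequential modules.

# Sketch of explicit FSDP2 weight prefetching, assuming a HF-style decoder stack.
from torch.distributed.fsdp import fully_shard

for block in model.model.layers:
    fully_shard(block)          # each transformer block becomes its own FSDP2 unit
fully_shard(model)              # root wrapper for the remaining parameters

blocks = list(model.model.layers)
for i, block in enumerate(blocks):
    # Kick off the next block's all-gather while the current block computes,
    # and the previous block's all-gather during backward.
    if i + 1 < len(blocks):
        block.set_modules_to_forward_prefetch([blocks[i + 1]])
    if i > 0:
        block.set_modules_to_backward_prefetch([blocks[i - 1]])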

examples/llm_finetune/qwen/qwen2_5_32b_peft_benchmark_2nodes.yaml renamed to examples/llm_finetune/qwen/custom_qwen2_5_32b_peft_benchmark_2nodes.yaml

Lines changed: 12 additions & 16 deletions

@@ -8,8 +8,8 @@
 
 recipe: TrainFinetuneRecipeForNextTokenPrediction
 
-seed: 42
-
+seed: 42
+
 # Benchmark section
 benchmark:
   warmup_steps: 5
@@ -21,14 +21,14 @@ benchmark:
 
 step_scheduler:
   global_batch_size: 32
-  local_batch_size: 8
+  local_batch_size: 2
   ckpt_every_steps: 50
   val_every_steps: 1000
   max_steps: 10
 
 dist_env:
   backend: nccl
-  timeout_minutes: 1
+  timeout_minutes: 10
 
 rng:
   _target_: nemo_automodel.components.training.rng.StatefulRNG
@@ -55,21 +55,18 @@ checkpoint:
 
 distributed:
   strategy: fsdp2
-  dp_size: 4
-  tp_size: 1
+  dp_size: null
+  tp_size: 2
   cp_size: 1
-  pp_size: 4
+  pp_size: 1
 
   sequence_parallel: false
   activation_checkpointing: true
-
-  pipeline:
-    pp_schedule: interleaved1f1b
-    pp_microbatch_size: 1
-    layers_per_stage: 2
-    scale_grads_in_schedule: false
-    round_virtual_stages_to_pp_multiple: up
-    dtype: bf16
+  enable_async_tensor_parallel: false
+  enable_fsdp2_prefetch: true
+  enable_compile: true
+  defer_fsdp_grad_sync: false
+  patch_is_packed_sequence: true # Patch transformers._is_packed_sequence to always return False: removes CPU-GPU sync per attention layer and ensures static shapes for torch.compile. Safe for non-packed (standard) training only.
 
 loss_fn:
   _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
@@ -84,7 +81,6 @@ dataset:
 dataloader:
   _target_: torch.utils.data.DataLoader
   batch_size: null # Dataset already yields batches
-  # Note: model_config will be auto-injected by train_ft.py for PP models
 
 optimizer:
   _target_: torch.optim.Adam
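
enable_async_tensor_parallel stays false in the Qwen configs but is turned on in the Llama 3.3 70B configs above. For background, a hedged sketch of the standard PyTorch recipe for async (micro-pipelined) tensor parallelism, as used by torchtitan; whether this repo enables it through exactly these knobs is an assumption, and both are private/experimental APIs.

# Assumed enablement path for async TP; the TP-parallelized blocks must run under
# torch.compile for the pass to apply (consistent with enable_compile: true above).
import torch
from torch.distributed._symmetric_memory import enable_symm_mem_for_group

def enable_async_tp(tp_mesh) -> None:
    # tp_mesh: the tensor-parallel DeviceMesh (assumed to be available from the FSDP2 manager).
    torch._inductor.config._micro_pipeline_tp = True            # let inductor pipeline TP collectives with matmuls
    enable_symm_mem_for_group(tp_mesh.get_group().group_name)   # back the collectives with symmetric memory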
examples/llm_pretrain/custom_llama3_1_70b_pretrain_benchmark_8nodes.yaml

Lines changed: 93 additions & 0 deletions

@@ -0,0 +1,93 @@
+# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# To run (from the repository root):
+# Intended layout: 64 ranks (8 nodes x 8 GPUs; matches tp=1, cp=2, auto dp=32 in distributed.*).
+# The interactive `automodel` CLI only launches a single node. For 8 nodes use torchrun or your
+# cluster's multi-process wrapper, e.g.:
+#   torchrun --nnodes=8 --nproc_per_node=8 --node_rank=<rank> --master_addr=<host> --master_port=<port> \
+#     -m nemo_automodel.cli.app \
+#     examples/llm_pretrain/custom_llama3_1_70b_pretrain_benchmark_8nodes.yaml
+
+recipe: PretrainRecipeForNextTokenPrediction
+
+seed: 42
+
+benchmark:
+  warmup_steps: 5
+  peak_tflops: 989 # H100: 989, A100: 312
+  nsys_start: -1
+  nsys_end: -1
+  nsys_ranks: []
+  num_nodes: 8
+
+step_scheduler:
+  global_batch_size: 128
+  local_batch_size: 1 # dp=32: grad_acc=4 steps per rank
+  ckpt_every_steps: 2000
+  num_epochs: 1
+  max_steps: 10
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 60
+
+model:
+  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
+  pretrained_model_name_or_path: meta-llama/Llama-3.1-70B
+  torch_dtype: bf16
+  trust_remote_code: True
+  backend:
+    _target_: nemo_automodel.components.models.common.BackendConfig
+    rms_norm: torch_fp32
+
+checkpoint:
+  enabled: False
+
+dataset:
+  _target_: nemo_automodel.components.datasets.llm.mock_iterable_dataset.MockIterableDataset
+  vocab_size: 100
+  seq_len: 8192
+  num_samples: 1000000
+  batch_size: 1 # Must match step_scheduler.local_batch_size
+
+dataloader:
+  _target_: torch.utils.data.DataLoader
+  batch_size: null
+
+loss_fn:
+  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
+
+optimizer:
+  _target_: transformer_engine.pytorch.optimizers.FusedAdam
+  lr: 0.0002
+  betas: [0.9, 0.95]
+  weight_decay: 0.1
+  adam_w_mode: True
+
+distributed:
+  strategy: fsdp2
+  dp_size: null # auto: 64 / (tp1 * cp2 * pp1) = 32
+  tp_size: 1
+  cp_size: 2
+  pp_size: 1
+  sequence_parallel: False
+  activation_checkpointing: True
+  enable_async_tensor_parallel: False
+  enable_fsdp2_prefetch: True
+  enable_compile: True
+  patch_is_packed_sequence: True # Patch transformers._is_packed_sequence to always return False: removes CPU-GPU sync per attention layer and ensures static shapes for torch.compile. Safe for non-packed (standard) training only.
+  defer_fsdp_grad_sync: False # Must be False with GA>1: True → delayed resharding → OOM
+  defer_rs_grad_accum: True # GA=4: replaces 4× RS with 1× AllReduce (saves 2× RS bandwidth)
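
The last two flags concern how gradient accumulation (GA=4 here: 128 / (1 × 32)) interacts with FSDP2's backward collectives. The loop below is an illustration with plain FSDP2 APIs of deferring gradient reduction to the final micro-batch, which is the trade-off defer_rs_grad_accum names; the memory caveat echoes the defer_fsdp_grad_sync comment above, while the loader structure and HF-style loss access are assumptions, not this recipe's code.

# Illustration only: defer gradient reduction until the last micro-batch of a step.
# Assumes `model` is the fully_shard()-wrapped root module and `train_loader`
# yields a list of micro-batch dicts per optimizer step.
num_microbatches = 4  # global_batch_size 128 / (local_batch_size 1 * dp 32)

for micro_batches in train_loader:
    for i, batch in enumerate(micro_batches):
        is_last = i == num_microbatches - 1
        # False -> skip the per-micro-batch reduce-scatter; gradients accumulate
        # locally (in unsharded form, hence the memory cost) and are reduced
        # once on the final backward of the step.
        model.set_requires_gradient_sync(is_last)
        loss = model(**batch).loss / num_microbatches
        loss.backward()
    optimizer.step()
    optimizer.zero_grad()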

nemo_automodel/_transformers/infrastructure.py

Lines changed: 12 additions & 0 deletions

@@ -422,6 +422,7 @@ def apply_model_infrastructure(
         autopipeline=autopipeline,
         tp_size=mesh.tp_size,
         ep_size=mesh.ep_size,
+        dp_shard_size=mesh.dp_shard_size,
         pretrained_model_name_or_path=pretrained_model_name_or_path,
         load_base_model=load_base_model,
         peft_config=peft_config,
@@ -513,6 +514,17 @@ def apply_model_infrastructure(
         load_base_model=load_base_model,
     )
 
+    # Apply per-layer torch.compile after checkpoint loading so that the _orig_mod key prefix
+    # introduced by torch.compile doesn't conflict with HF checkpoint key names.
+    if isinstance(model_wrapper, FSDP2Manager) and (
+        model_wrapper.enable_compile or model_wrapper.enable_async_tensor_parallel
+    ):
+        from nemo_automodel.components.distributed.parallelizer import _apply_per_layer_compile
+
+        model_parts = model.parts if hasattr(model, "parts") else [model]
+        for mp in model_parts:
+            _apply_per_layer_compile(mp)
+
     # Freeze parameters after checkpoint loading and parallelization
     # This catches params created during parallelization (e.g., GroupedExpertsTE in init_token_dispatcher)
     if peft_config is not None:
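
The hunk above calls _apply_per_layer_compile, whose body is not part of this diff. A hypothetical sketch of what a per-layer compile helper can look like, assuming a HF-style model.model.layers stack: compiling blocks individually keeps graph breaks at layer boundaries and confines the _orig_mod.* state-dict prefix to per-layer submodules, which is why it has to run after the HF checkpoint load.

# Hypothetical stand-in for _apply_per_layer_compile; the real helper in
# nemo_automodel.components.distributed.parallelizer may differ.
import torch
import torch.nn as nn

def apply_per_layer_compile(model: nn.Module) -> None:
    layers = getattr(getattr(model, "model", model), "layers", None)  # HF-style decoder stack (assumption)
    if layers is None:
        return
    for idx in range(len(layers)):
        layers[idx] = torch.compile(layers[idx])  # nn.ModuleList supports in-place item replacement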

nemo_automodel/_transformers/utils.py

Lines changed: 7 additions & 5 deletions

@@ -26,14 +26,15 @@ def _should_load_before_shard(
     autopipeline: Optional[object],
     tp_size: int,
     ep_size: int,
+    dp_shard_size: int = 1,
     pretrained_model_name_or_path: str,
     load_base_model: bool,
     peft_config: Optional[object],
 ) -> bool:
     """Decide whether to load the checkpoint before FSDP/TP/EP sharding.
 
-    Load-before-shard is only safe when running single-GPU (no PP, TP, or EP)
-    and a checkpoint actually needs loading.
+    Load-before-shard is only safe when running single-GPU (no PP, TP, EP, or
+    DP sharding) and a checkpoint actually needs loading.
     With any model parallelism the post-shard load path must be used to avoid
     NCCL collective mismatches or key/device inconsistencies.
 
@@ -43,12 +44,13 @@ def _should_load_before_shard(
     no_pp = autopipeline is None
     no_tp = tp_size <= 1
     no_ep = ep_size <= 1
+    no_dp_shard = dp_shard_size <= 1
     no_peft = peft_config is None
     need_checkpoint_load = bool(pretrained_model_name_or_path and load_base_model)
-    result = no_pp and no_tp and no_ep and no_peft and need_checkpoint_load
+    result = no_pp and no_tp and no_ep and no_dp_shard and no_peft and need_checkpoint_load
     logger.debug(
-        "[_should_load_before_shard] no_pp={} no_tp={} no_ep={} no_peft={} need_load={} -> {}".format(
-            no_pp, no_tp, no_ep, no_peft, need_checkpoint_load, result
+        "[_should_load_before_shard] no_pp={} no_tp={} no_ep={} no_dp_shard={} no_peft={} need_load={} -> {}".format(
+            no_pp, no_tp, no_ep, no_dp_shard, no_peft, need_checkpoint_load, result
        )
     )
     return result
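
A usage illustration of the new gate (argument values are made up; the outcome follows directly from the predicate above): any FSDP shard degree above one now forces the post-shard load path, matching the dp_shard_size plumbing added in infrastructure.py.

# Illustration only: with FSDP2 sharding over 8 ranks, pre-shard loading is skipped.
early_load = _should_load_before_shard(
    autopipeline=None,                    # no pipeline parallelism
    tp_size=1,
    ep_size=1,
    dp_shard_size=8,                      # new gate added in this commit
    pretrained_model_name_or_path="meta-llama/Llama-3.1-70B",
    load_base_model=True,
    peft_config=None,
)
assert early_load is False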
