From f010b596fecc5b919540eb03eede9870d89df0f3 Mon Sep 17 00:00:00 2001
From: Nic Borensztein
Date: Tue, 7 Apr 2026 23:10:45 -0700
Subject: [PATCH 1/2] Expose fully_parallel_save in save_megatron_model

Add fully_parallel_save parameter to AutoBridge.save_megatron_model() and
model_load_save.save_megatron_model(), forwarded to CheckpointConfig.
Defaults to True (no behavior change). Callers can pass False to disable
FullyParallelSaveStrategyWrapper, which deadlocks when the distributed world
includes ranks that do not participate in the save (e.g., vLLM inference
workers in NeMo RL non-colocated setups).

Needed by NVIDIA-NeMo/RL#2226.
---
 src/megatron/bridge/models/conversion/auto_bridge.py | 6 ++++++
 src/megatron/bridge/training/model_load_save.py      | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/src/megatron/bridge/models/conversion/auto_bridge.py b/src/megatron/bridge/models/conversion/auto_bridge.py
index e1df1c5b97..9749f2a5a2 100644
--- a/src/megatron/bridge/models/conversion/auto_bridge.py
+++ b/src/megatron/bridge/models/conversion/auto_bridge.py
@@ -884,6 +884,7 @@ def save_megatron_model(
         hf_tokenizer_path: Optional[str | Path] = None,
         low_memory_save: bool = False,
         hf_tokenizer_kwargs: Optional[dict] = None,
+        fully_parallel_save: bool = True,
     ) -> None:
         """
         Save a Megatron model in native Megatron checkpoint format without optimizer
@@ -907,6 +908,10 @@ def save_megatron_model(
             hf_tokenizer_kwargs: Optional dictionary of kwargs to pass to the HuggingFace tokenizer.
                 Common options include trust_remote_code=True for models with custom tokenizers,
                 or use_fast=True for models that require the fast tokenizer.
+            fully_parallel_save: If True (default), uses fully parallel save strategy which
+                requires all DP ranks to participate in collective operations. Set to False
+                when saving from contexts where not all ranks will enter the save path
+                (e.g., mixed training/inference worlds with non-colocated vLLM).

         Example:
             >>> # Save model checkpoint after conversion
@@ -942,6 +947,7 @@ def save_megatron_model(
             hf_tokenizer_path=hf_tokenizer_path,
             low_memory_save=low_memory_save,
             hf_tokenizer_kwargs=hf_tokenizer_kwargs,
+            fully_parallel_save=fully_parallel_save,
         )

     def load_megatron_model(
diff --git a/src/megatron/bridge/training/model_load_save.py b/src/megatron/bridge/training/model_load_save.py
index 5265ac7792..8b2262f3fe 100644
--- a/src/megatron/bridge/training/model_load_save.py
+++ b/src/megatron/bridge/training/model_load_save.py
@@ -446,6 +446,7 @@ def save_megatron_model(
     hf_tokenizer_path: Optional[Union[str, Path]] = None,
     low_memory_save: bool = False,
     hf_tokenizer_kwargs: Optional[dict] = None,
+    fully_parallel_save: bool = True,
 ) -> None:
     """Save a Megatron model in native Megatron checkpoint format without optimizer state.

@@ -472,6 +473,10 @@ def save_megatron_model(
             Default is False, preserving the model for further use.
         hf_tokenizer_kwargs: Optional dictionary of kwargs to pass to the HuggingFace tokenizer.
             Common options include trust_remote_code=True for models with custom tokenizers.
+        fully_parallel_save: If True (default), uses fully parallel save strategy which
+            requires all DP ranks to participate in collective operations. Set to False
+            when saving from contexts where not all ranks will enter the save path
+            (e.g., mixed training/inference worlds with non-colocated vLLM).

     Example:
         >>> # Save model checkpoint
@@ -538,6 +543,7 @@ def save_megatron_model(
             save_rng=False,
             ckpt_format=ckpt_format,
             dist_ckpt_optim_fully_reshardable=True,
+            fully_parallel_save=fully_parallel_save,
         ),
         dist=None,
     )

From 664a8a85bc7f72814dad0ddea96acdabe5c38b4c Mon Sep 17 00:00:00 2001
From: Nic Borensztein
Date: Wed, 8 Apr 2026 10:10:17 -0700
Subject: [PATCH 2/2] Add validate_access_integrity and
 distributed_timeout_minutes to save_megatron_model

The all_gather_object in determine_global_metadata (validation.py:518) uses
the default PG. When some ranks take longer to build their state dict (e.g.,
due to expert parallelism), the collective times out. For one-time conversion
saves, validation is unnecessary and can be safely skipped.

Also adds distributed_timeout_minutes for callers that need longer timeouts
during large model saves.
---
 src/megatron/bridge/models/conversion/auto_bridge.py | 2 ++
 src/megatron/bridge/training/model_load_save.py      | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/src/megatron/bridge/models/conversion/auto_bridge.py b/src/megatron/bridge/models/conversion/auto_bridge.py
index 9749f2a5a2..8b215e40e8 100644
--- a/src/megatron/bridge/models/conversion/auto_bridge.py
+++ b/src/megatron/bridge/models/conversion/auto_bridge.py
@@ -885,6 +885,7 @@ def save_megatron_model(
         low_memory_save: bool = False,
         hf_tokenizer_kwargs: Optional[dict] = None,
         fully_parallel_save: bool = True,
+        validate_access_integrity: bool = True,
     ) -> None:
         """
         Save a Megatron model in native Megatron checkpoint format without optimizer
@@ -948,6 +949,7 @@ def save_megatron_model(
             low_memory_save=low_memory_save,
             hf_tokenizer_kwargs=hf_tokenizer_kwargs,
             fully_parallel_save=fully_parallel_save,
+            validate_access_integrity=validate_access_integrity,
         )

     def load_megatron_model(
diff --git a/src/megatron/bridge/training/model_load_save.py b/src/megatron/bridge/training/model_load_save.py
index 8b2262f3fe..9ff7629a18 100644
--- a/src/megatron/bridge/training/model_load_save.py
+++ b/src/megatron/bridge/training/model_load_save.py
@@ -447,6 +447,8 @@ def save_megatron_model(
     low_memory_save: bool = False,
     hf_tokenizer_kwargs: Optional[dict] = None,
     fully_parallel_save: bool = True,
+    validate_access_integrity: bool = True,
+    distributed_timeout_minutes: int = 10,
 ) -> None:
     """Save a Megatron model in native Megatron checkpoint format without optimizer state.

@@ -544,6 +546,7 @@ def save_megatron_model(
             ckpt_format=ckpt_format,
             dist_ckpt_optim_fully_reshardable=True,
             fully_parallel_save=fully_parallel_save,
+            ckpt_assume_constant_structure=not validate_access_integrity,
         ),
         dist=None,
     )
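
Usage sketch (editor's illustration, not part of either patch): a caller in a mixed
training/inference world, such as a NeMo RL non-colocated setup, might drive the new
flags as below. The import path mirrors the file touched by this series, but the
bridge construction, model objects, and positional arguments are assumptions taken
from the surrounding API; only the fully_parallel_save and validate_access_integrity
keywords are introduced by these commits.

    # Hypothetical caller -- illustrative only. `bridge` construction and the
    # positional arguments are assumed, not defined by this patch series.
    from megatron.bridge.models.conversion.auto_bridge import AutoBridge  # path from the diff

    bridge = AutoBridge.from_hf_pretrained("some-org/some-model")  # assumed constructor
    megatron_model = ...  # Megatron module(s) produced elsewhere in the conversion flow

    bridge.save_megatron_model(
        megatron_model,
        "/checkpoints/converted",
        fully_parallel_save=False,        # skip FullyParallelSaveStrategyWrapper so ranks that
                                          # never enter the save (e.g., vLLM workers) cannot deadlock it
        validate_access_integrity=False,  # one-time conversion save: skip the global-metadata
                                          # validation collective that can time out under expert parallelism
    )

Note that distributed_timeout_minutes is only exposed on the lower-level
model_load_save.save_megatron_model() in this series, so it is not passed through the
AutoBridge call above.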