From f010b596fecc5b919540eb03eede9870d89df0f3 Mon Sep 17 00:00:00 2001
From: Nic Borensztein
Date: Tue, 7 Apr 2026 23:10:45 -0700
Subject: [PATCH 1/2] Expose fully_parallel_save in save_megatron_model

Add fully_parallel_save parameter to AutoBridge.save_megatron_model() and
model_load_save.save_megatron_model(), forwarded to CheckpointConfig.
Defaults to True (no behavior change). Callers can pass False to disable
FullyParallelSaveStrategyWrapper, which deadlocks when the distributed world
includes ranks that do not participate in the save (e.g., vLLM inference
workers in NeMo RL non-colocated setups).

Needed by NVIDIA-NeMo/RL#2226.
---
 src/megatron/bridge/models/conversion/auto_bridge.py | 6 ++++++
 src/megatron/bridge/training/model_load_save.py      | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/src/megatron/bridge/models/conversion/auto_bridge.py b/src/megatron/bridge/models/conversion/auto_bridge.py
index e1df1c5b97..9749f2a5a2 100644
--- a/src/megatron/bridge/models/conversion/auto_bridge.py
+++ b/src/megatron/bridge/models/conversion/auto_bridge.py
@@ -884,6 +884,7 @@ def save_megatron_model(
         hf_tokenizer_path: Optional[str | Path] = None,
         low_memory_save: bool = False,
         hf_tokenizer_kwargs: Optional[dict] = None,
+        fully_parallel_save: bool = True,
     ) -> None:
         """
         Save a Megatron model in native Megatron checkpoint format without optimizer
@@ -907,6 +908,10 @@ def save_megatron_model(
             hf_tokenizer_kwargs: Optional dictionary of kwargs to pass to the HuggingFace tokenizer.
                 Common options include trust_remote_code=True for models with custom tokenizers,
                 or use_fast=True for models that require the fast tokenizer.
+            fully_parallel_save: If True (default), uses fully parallel save strategy which
+                requires all DP ranks to participate in collective operations. Set to False
+                when saving from contexts where not all ranks will enter the save path
+                (e.g., mixed training/inference worlds with non-colocated vLLM).

         Example:
             >>> # Save model checkpoint after conversion
@@ -942,6 +947,7 @@ def save_megatron_model(
             hf_tokenizer_path=hf_tokenizer_path,
             low_memory_save=low_memory_save,
             hf_tokenizer_kwargs=hf_tokenizer_kwargs,
+            fully_parallel_save=fully_parallel_save,
         )

     def load_megatron_model(
diff --git a/src/megatron/bridge/training/model_load_save.py b/src/megatron/bridge/training/model_load_save.py
index 5265ac7792..8b2262f3fe 100644
--- a/src/megatron/bridge/training/model_load_save.py
+++ b/src/megatron/bridge/training/model_load_save.py
@@ -446,6 +446,7 @@ def save_megatron_model(
     hf_tokenizer_path: Optional[Union[str, Path]] = None,
     low_memory_save: bool = False,
     hf_tokenizer_kwargs: Optional[dict] = None,
+    fully_parallel_save: bool = True,
 ) -> None:
     """Save a Megatron model in native Megatron checkpoint format without optimizer state.

@@ -472,6 +473,10 @@ def save_megatron_model(
             Default is False, preserving the model for further use.
         hf_tokenizer_kwargs: Optional dictionary of kwargs to pass to the HuggingFace tokenizer.
             Common options include trust_remote_code=True for models with custom tokenizers.
+        fully_parallel_save: If True (default), uses fully parallel save strategy which
+            requires all DP ranks to participate in collective operations. Set to False
+            when saving from contexts where not all ranks will enter the save path
+            (e.g., mixed training/inference worlds with non-colocated vLLM).

     Example:
         >>> # Save model checkpoint
@@ -538,6 +543,7 @@ def save_megatron_model(
             save_rng=False,
             ckpt_format=ckpt_format,
             dist_ckpt_optim_fully_reshardable=True,
+            fully_parallel_save=fully_parallel_save,
         ),
         dist=None,
     )

From 664a8a85bc7f72814dad0ddea96acdabe5c38b4c Mon Sep 17 00:00:00 2001
From: Nic Borensztein
Date: Wed, 8 Apr 2026 10:10:17 -0700
Subject: [PATCH 2/2] Add validate_access_integrity and
 distributed_timeout_minutes to save_megatron_model

The all_gather_object in determine_global_metadata (validation.py:518) uses
the default PG. When some ranks take longer to build their state dict (e.g.,
due to expert parallelism), the collective times out. For one-time conversion
saves, validation is unnecessary and can be safely skipped.

Also adds distributed_timeout_minutes for callers that need longer timeouts
during large model saves.
---
 src/megatron/bridge/models/conversion/auto_bridge.py | 2 ++
 src/megatron/bridge/training/model_load_save.py      | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/src/megatron/bridge/models/conversion/auto_bridge.py b/src/megatron/bridge/models/conversion/auto_bridge.py
index 9749f2a5a2..8b215e40e8 100644
--- a/src/megatron/bridge/models/conversion/auto_bridge.py
+++ b/src/megatron/bridge/models/conversion/auto_bridge.py
@@ -885,6 +885,7 @@ def save_megatron_model(
         low_memory_save: bool = False,
         hf_tokenizer_kwargs: Optional[dict] = None,
         fully_parallel_save: bool = True,
+        validate_access_integrity: bool = True,
     ) -> None:
         """
         Save a Megatron model in native Megatron checkpoint format without optimizer
@@ -948,6 +949,7 @@ def save_megatron_model(
             low_memory_save=low_memory_save,
             hf_tokenizer_kwargs=hf_tokenizer_kwargs,
             fully_parallel_save=fully_parallel_save,
+            validate_access_integrity=validate_access_integrity,
         )

     def load_megatron_model(
diff --git a/src/megatron/bridge/training/model_load_save.py b/src/megatron/bridge/training/model_load_save.py
index 8b2262f3fe..9ff7629a18 100644
--- a/src/megatron/bridge/training/model_load_save.py
+++ b/src/megatron/bridge/training/model_load_save.py
@@ -447,6 +447,8 @@ def save_megatron_model(
     low_memory_save: bool = False,
     hf_tokenizer_kwargs: Optional[dict] = None,
     fully_parallel_save: bool = True,
+    validate_access_integrity: bool = True,
+    distributed_timeout_minutes: int = 10,
 ) -> None:
     """Save a Megatron model in native Megatron checkpoint format without optimizer state.

@@ -544,6 +546,7 @@ def save_megatron_model(
             ckpt_format=ckpt_format,
             dist_ckpt_optim_fully_reshardable=True,
             fully_parallel_save=fully_parallel_save,
+            ckpt_assume_constant_structure=not validate_access_integrity,
         ),
         dist=None,
     )
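
Usage sketch (editor's illustration, not part of either patch): a caller in a mixed
training/inference world, such as a NeMo RL non-colocated setup, might drive the new
flags as below. The import path mirrors the file touched by this series, but the
bridge construction, model objects, and positional arguments are assumptions taken
from the surrounding API; only the fully_parallel_save and validate_access_integrity
keywords are introduced by these commits.

    # Hypothetical caller -- illustrative only. `bridge` construction and the
    # positional arguments are assumed, not defined by this patch series.
    from megatron.bridge.models.conversion.auto_bridge import AutoBridge  # path from the diff

    bridge = AutoBridge.from_hf_pretrained("some-org/some-model")  # assumed constructor
    megatron_model = ...  # Megatron module(s) produced elsewhere in the conversion flow

    bridge.save_megatron_model(
        megatron_model,
        "/checkpoints/converted",
        fully_parallel_save=False,        # skip FullyParallelSaveStrategyWrapper so ranks that
                                          # never enter the save (e.g., vLLM workers) cannot deadlock it
        validate_access_integrity=False,  # one-time conversion save: skip the global-metadata
                                          # validation collective that can time out under expert parallelism
    )

Note that distributed_timeout_minutes is only exposed on the lower-level
model_load_save.save_megatron_model() in this series, so it is not passed through the
AutoBridge call above.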