diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml index b2bc01eb3f..8a7e3bb6bd 100644 --- a/examples/configs/distillation_math.yaml +++ b/examples/configs/distillation_math.yaml @@ -90,6 +90,15 @@ policy: &POLICY_BASE force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 0 activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null # recompute_granularity controls activation checkpointing depth. # "full": recompute all activations (default, max memory savings). # "selective": recompute only specific modules (see recompute_modules). diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index 944ede3bf0..2333d6b864 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -39,6 +39,15 @@ policy: &POLICY_BASE enabled: true empty_unused_memory_level: 0 activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. 
+ # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null # recompute_granularity controls activation checkpointing depth. # "full": recompute all activations (default, max memory savings). # "selective": recompute only specific modules (see recompute_modules). diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index 4bc8623eac..5a2a6ec03b 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -129,6 +129,15 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null # recompute_granularity controls activation checkpointing depth. # "full": recompute all activations (default, max memory savings). # "selective": recompute only specific modules (see recompute_modules). 
diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 7a72102ce2..29807caf69 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -134,6 +134,15 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null # recompute_granularity controls activation checkpointing depth. # "full": recompute all activations (default, max memory savings). # "selective": recompute only specific modules (see recompute_modules). diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 4fdf609364..ef82d03d39 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -86,6 +86,15 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. 
activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null # recompute_granularity controls activation checkpointing depth. # "full": recompute all activations (default, max memory savings). # "selective": recompute only specific modules (see recompute_modules). diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index 01fffbf2c5..1885c8e2ee 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -110,6 +110,15 @@ policy: env_vars: {} empty_unused_memory_level: 1 activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null # recompute_granularity controls activation checkpointing depth. # "full": recompute all activations (default, max memory savings). # "selective": recompute only specific modules (see recompute_modules). 
diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index bced18b57b..05d9bc3dc0 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -735,6 +735,24 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: "Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue." ) + # Megatron validates module names and per-model-type compatibility. + # Note: Megatron-Bridge's standalone training path also sets NUMA-aware + # CPU affinity via set_ideal_affinity_for_current_gpu() when this is on, + # which improves PCIe/DRAM throughput. NeMo-RL does not call it; users + # who need maximum offload bandwidth may want to set affinity externally. + fine_grained_activation_offloading = config["megatron_cfg"].get( + "fine_grained_activation_offloading" + ) + if fine_grained_activation_offloading: + offload_modules = config["megatron_cfg"].get("offload_modules") + if not isinstance(offload_modules, list) or not offload_modules: + raise ValueError( + "offload_modules must be a non-empty list when " + "fine_grained_activation_offloading is True." + ) + model_cfg.fine_grained_activation_offloading = True + model_cfg.offload_modules = offload_modules + def _validate_optimizer_config(config: PolicyConfig) -> None: """Validate optimizer configuration.""" diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 446d001af9..9e3f64113c 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Literal, NotRequired, TypedDict, Union +from typing import Any, Literal, NotRequired, Optional, TypedDict, Union from nemo_rl.models.generation.interfaces import GenerationConfig from nemo_rl.utils.checkpoint import PretrainedCheckpointConfig @@ -247,6 +247,19 @@ class MegatronConfig(TypedDict): moe_token_dispatcher_type: str # Can be used only with 'alltoall' token dispatcher moe_shared_expert_overlap: bool + # Offload specific module activations to CPU to reduce peak GPU memory. + # Works with both dense and MoE models. Different from + # optimizer_cpu_offload which offloads optimizer states. + # Requires transformer_engine. For TE >= 2.10.0 also requires + # NVTE_CPU_OFFLOAD_V1=1 in the environment (validated by + # Megatron-Bridge at runtime). + fine_grained_activation_offloading: NotRequired[bool] + # Modules to offload when fine_grained_activation_offloading is True. + # Required (no default). Valid values: + # "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm", + # "expert_fc1", "moe_act". Note: "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: NotRequired[Optional[list[str]]] # Enable grouped GEMM for MoE experts via CUTLASS. Significant throughput # gain when multiple experts are assigned per rank (num_local_experts > 1). # Requires TE >= 1.11.0 for FP8 and Ampere (sm_80) or newer. 
diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py
index ff5c55c2f0..36a31c910e 100644
--- a/tests/unit/models/megatron/test_megatron_setup.py
+++ b/tests/unit/models/megatron/test_megatron_setup.py
@@ -1005,6 +1005,99 @@ def test_fp8_param_warning(self):
         with pytest.warns(UserWarning, match="fp8_param=True sometimes causes NaN"):
             _apply_performance_config(model_cfg, config)
 
+    def test_fine_grained_activation_offloading_enabled(self):
+        """Enabled flag plus a non-empty list copies both settings onto model_cfg."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        offload_modules = ["mlp_norm", "moe_act"]  # names from the documented valid set
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+                "offload_modules": offload_modules,
+            }
+        }
+
+        _apply_performance_config(model_cfg, config)
+
+        assert model_cfg.fine_grained_activation_offloading is True
+        assert model_cfg.offload_modules == offload_modules
+
+    def test_fine_grained_activation_offloading_disabled_skips(self):
+        """Flag absent from megatron_cfg: model_cfg gets no offload attributes."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock(spec=["gated_linear_unit"])
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+            }
+        }
+
+        _apply_performance_config(model_cfg, config)
+
+        assert not hasattr(model_cfg, "fine_grained_activation_offloading")
+        assert not hasattr(model_cfg, "offload_modules")
+
+    @pytest.mark.parametrize(
+        "offload_modules",
+        [[], None, "core_attn", 42],
+        ids=["empty_list", "none", "string", "int"],
+    )
+    def test_fine_grained_activation_offloading_invalid_modules_raises(
+        self, offload_modules
+    ):
+        """Enabled flag rejects empty, None, bare-str, and non-list offload_modules."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+                "offload_modules": offload_modules,
+            }
+        }
+
+        with pytest.raises(
+            ValueError, match="offload_modules must be a non-empty list"
+        ):
+            _apply_performance_config(model_cfg, config)
+
+    def test_fine_grained_activation_offloading_missing_modules_raises(self):
+        """Enabled with no offload_modules key: .get() yields None, which raises."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+            }
+        }
+
+        with pytest.raises(
+            ValueError, match="offload_modules must be a non-empty list"
+        ):
+            _apply_performance_config(model_cfg, config)
+
     def test_recompute_granularity_full_explicit(self):
         """granularity='full' sets uniform method with 1 layer."""
         from nemo_rl.models.megatron.setup import _apply_performance_config
diff --git a/tests/unit/reference_configs/distillation_math.yaml b/tests/unit/reference_configs/distillation_math.yaml
index 9b58e17809..c36a79451d 100644
--- a/tests/unit/reference_configs/distillation_math.yaml
+++ b/tests/unit/reference_configs/distillation_math.yaml
@@ -90,6 +90,8 @@ policy: &POLICY_BASE
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 0
activation_checkpointing: false + fine_grained_activation_offloading: false + offload_modules: null recompute_granularity: "full" recompute_modules: null converter_type: "Qwen3ForCausalLM" diff --git a/tests/unit/reference_configs/dpo.yaml b/tests/unit/reference_configs/dpo.yaml index 79aec40f28..f0b439adcb 100755 --- a/tests/unit/reference_configs/dpo.yaml +++ b/tests/unit/reference_configs/dpo.yaml @@ -124,6 +124,8 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 activation_checkpointing: false + fine_grained_activation_offloading: false + offload_modules: null recompute_granularity: "full" recompute_modules: null tensor_model_parallel_size: 2 diff --git a/tests/unit/reference_configs/grpo_math_1B.yaml b/tests/unit/reference_configs/grpo_math_1B.yaml index 6ce16df86d..50ad5a7270 100644 --- a/tests/unit/reference_configs/grpo_math_1B.yaml +++ b/tests/unit/reference_configs/grpo_math_1B.yaml @@ -135,6 +135,8 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. activation_checkpointing: false + fine_grained_activation_offloading: false + offload_modules: null recompute_granularity: "full" recompute_modules: null converter_type: "Qwen2ForCausalLM" diff --git a/tests/unit/reference_configs/sft.yaml b/tests/unit/reference_configs/sft.yaml index 9a508fab5f..f5397ba7b1 100644 --- a/tests/unit/reference_configs/sft.yaml +++ b/tests/unit/reference_configs/sft.yaml @@ -105,6 +105,8 @@ policy: env_vars: {} empty_unused_memory_level: 1 activation_checkpointing: false + fine_grained_activation_offloading: false + offload_modules: null recompute_granularity: "full" recompute_modules: null tensor_model_parallel_size: 1