NVIDIA-NeMo · seonjinn · Apr 17, 2026 · Apr 23, 2026 · Apr 23, 2026 · May 14, 2026
@@ -90,6 +90,15 @@ policy: &POLICY_BASE
         force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
         empty_unused_memory_level: 0
         activation_checkpointing: false
+        # Offload the activations of selected modules to CPU. Works for both dense
+        # and MoE models. Requires transformer_engine; with TE >= 2.10.0 the
+        # environment must also set NVTE_CPU_OFFLOAD_V1=1. Not to be confused with
+        # optimizer_cpu_offload, which offloads optimizer states.
+        fine_grained_activation_offloading: false
+        # Modules to offload; must be a non-empty list when
+        # fine_grained_activation_offloading is true.
+        # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+        # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+        # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+        offload_modules: null
         # recompute_granularity controls activation checkpointing depth.
         # "full": recompute all activations (default, max memory savings).
         # "selective": recompute only specific modules (see recompute_modules).

@@ -39,6 +39,15 @@ policy: &POLICY_BASE
         enabled: true
         empty_unused_memory_level: 0
         activation_checkpointing: false
+        # Offload the activations of selected modules to CPU. Works for both dense
+        # and MoE models. Requires transformer_engine; with TE >= 2.10.0 the
+        # environment must also set NVTE_CPU_OFFLOAD_V1=1. Not to be confused with
+        # optimizer_cpu_offload, which offloads optimizer states.
+        fine_grained_activation_offloading: false
+        # Modules to offload; must be a non-empty list when
+        # fine_grained_activation_offloading is true.
+        # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+        # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+        # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+        offload_modules: null
         # recompute_granularity controls activation checkpointing depth.
         # "full": recompute all activations (default, max memory savings).
         # "selective": recompute only specific modules (see recompute_modules).

@@ -129,6 +129,15 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1
     activation_checkpointing: false
+    # Offload the activations of selected modules to CPU. Works for both dense
+    # and MoE models. Requires transformer_engine; with TE >= 2.10.0 the
+    # environment must also set NVTE_CPU_OFFLOAD_V1=1. Not to be confused with
+    # optimizer_cpu_offload, which offloads optimizer states.
+    fine_grained_activation_offloading: false
+    # Modules to offload; must be a non-empty list when
+    # fine_grained_activation_offloading is true.
+    # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+    # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: null
     # recompute_granularity controls activation checkpointing depth.
     # "full": recompute all activations (default, max memory savings).
     # "selective": recompute only specific modules (see recompute_modules).

@@ -134,6 +134,15 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
+    # Offload the activations of selected modules to CPU. Works for both dense
+    # and MoE models. Requires transformer_engine; with TE >= 2.10.0 the
+    # environment must also set NVTE_CPU_OFFLOAD_V1=1. Not to be confused with
+    # optimizer_cpu_offload, which offloads optimizer states.
+    fine_grained_activation_offloading: false
+    # Modules to offload; must be a non-empty list when
+    # fine_grained_activation_offloading is true.
+    # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+    # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: null
     # recompute_granularity controls activation checkpointing depth.
     # "full": recompute all activations (default, max memory savings).
     # "selective": recompute only specific modules (see recompute_modules).

@@ -86,6 +86,15 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
+    # Offload the activations of selected modules to CPU. Works for both dense
+    # and MoE models. Requires transformer_engine; with TE >= 2.10.0 the
+    # environment must also set NVTE_CPU_OFFLOAD_V1=1. Not to be confused with
+    # optimizer_cpu_offload, which offloads optimizer states.
+    fine_grained_activation_offloading: false
+    # Modules to offload; must be a non-empty list when
+    # fine_grained_activation_offloading is true.
+    # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+    # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: null
     # recompute_granularity controls activation checkpointing depth.
     # "full": recompute all activations (default, max memory savings).
     # "selective": recompute only specific modules (see recompute_modules).

@@ -110,6 +110,15 @@ policy:
     env_vars: {}
     empty_unused_memory_level: 1
     activation_checkpointing: false
+    # Offload the activations of selected modules to CPU. Works for both dense
+    # and MoE models. Requires transformer_engine; with TE >= 2.10.0 the
+    # environment must also set NVTE_CPU_OFFLOAD_V1=1. Not to be confused with
+    # optimizer_cpu_offload, which offloads optimizer states.
+    fine_grained_activation_offloading: false
+    # Modules to offload; must be a non-empty list when
+    # fine_grained_activation_offloading is true.
+    # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+    # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: null
     # recompute_granularity controls activation checkpointing depth.
     # "full": recompute all activations (default, max memory savings).
     # "selective": recompute only specific modules (see recompute_modules).

@@ -735,6 +735,24 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
                 "Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue."
             )
 
+    # Only the shape of offload_modules (a non-empty list) is checked here;
+    # Megatron validates module names and per-model-type compatibility.
+    # Note: Megatron-Bridge's standalone training path also sets NUMA-aware
+    # CPU affinity via set_ideal_affinity_for_current_gpu() when this is on,
+    # which improves PCIe/DRAM throughput. NeMo-RL does not call it, so users
+    # who need maximum offload bandwidth may want to set affinity externally.
+    fine_grained_activation_offloading = config["megatron_cfg"].get(
+        "fine_grained_activation_offloading"
+    )
+    if fine_grained_activation_offloading:
+        offload_modules = config["megatron_cfg"].get("offload_modules")
+        if not isinstance(offload_modules, list) or not offload_modules:
+            raise ValueError(
+                "offload_modules must be a non-empty list when "
+                "fine_grained_activation_offloading is True."
+            )
+        model_cfg.fine_grained_activation_offloading = True
+        model_cfg.offload_modules = offload_modules
+
 
 def _validate_optimizer_config(config: PolicyConfig) -> None:
     """Validate optimizer configuration."""

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Literal, NotRequired, TypedDict, Union
+from typing import Any, Literal, NotRequired, Optional, TypedDict, Union
 
 from nemo_rl.models.generation.interfaces import GenerationConfig
 from nemo_rl.utils.checkpoint import PretrainedCheckpointConfig
@@ -247,6 +247,19 @@ class MegatronConfig(TypedDict):
     moe_token_dispatcher_type: str
     # Can be used only with 'alltoall' token dispatcher
     moe_shared_expert_overlap: bool
+    # Offload specific module activations to CPU to reduce peak GPU memory.
+    # Works with both dense and MoE models. Different from
+    # optimizer_cpu_offload which offloads optimizer states.
+    # Requires transformer_engine. For TE >= 2.10.0 also requires
+    # NVTE_CPU_OFFLOAD_V1=1 in the environment (validated by
+    # Megatron-Bridge at runtime).
+    fine_grained_activation_offloading: NotRequired[bool]
+    # Modules to offload when fine_grained_activation_offloading is True.
+    # Must be a non-empty list in that case (there is no default); may be
+    # omitted or None when the feature is disabled. Valid values:
+    # "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm",
+    # "expert_fc1", "moe_act". Note: "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: NotRequired[Optional[list[str]]]
     # Enable grouped GEMM for MoE experts via CUTLASS. Significant throughput
     # gain when multiple experts are assigned per rank (num_local_experts > 1).
     # Requires TE >= 1.11.0 for FP8 and Ampere (sm_80) or newer.

@@ -1005,6 +1005,99 @@ def test_fp8_param_warning(self):
         with pytest.warns(UserWarning, match="fp8_param=True sometimes causes NaN"):
             _apply_performance_config(model_cfg, config)
 
+    def test_fine_grained_activation_offloading_enabled(self):
+        """Test happy path: enabled with non-empty offload_modules list."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        offload_modules = ["mlp_norm", "moe_act"]
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+                "offload_modules": offload_modules,
+            }
+        }
+
+        _apply_performance_config(model_cfg, config)
+
+        assert model_cfg.fine_grained_activation_offloading is True
+        assert model_cfg.offload_modules == offload_modules
+
+    def test_fine_grained_activation_offloading_disabled_skips(self):
+        """When flag is False (default), no offload attrs should be set."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock(spec=["gated_linear_unit"])
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+            }
+        }
+
+        _apply_performance_config(model_cfg, config)
+
+        assert not hasattr(model_cfg, "fine_grained_activation_offloading")
+        assert not hasattr(model_cfg, "offload_modules")
+
+    @pytest.mark.parametrize(
+        "offload_modules",
+        [[], None, "mlp", 42],
+        ids=["empty_list", "none", "string", "int"],
+    )
+    def test_fine_grained_activation_offloading_invalid_modules_raises(
+        self, offload_modules
+    ):
+        """offload_modules must be a non-empty list when feature is enabled."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+                "offload_modules": offload_modules,
+            }
+        }
+
+        with pytest.raises(
+            ValueError, match="offload_modules must be a non-empty list"
+        ):
+            _apply_performance_config(model_cfg, config)
+
+    def test_fine_grained_activation_offloading_missing_modules_raises(self):
+        """When enabled but offload_modules key is absent, defaults to None → raises."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+            }
+        }
+
+        with pytest.raises(
+            ValueError, match="offload_modules must be a non-empty list"
+        ):
+            _apply_performance_config(model_cfg, config)
+
     def test_recompute_granularity_full_explicit(self):
         """granularity='full' sets uniform method with 1 layer."""
         from nemo_rl.models.megatron.setup import _apply_performance_config

@@ -90,6 +90,8 @@ policy: &POLICY_BASE
         force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
         empty_unused_memory_level: 0
         activation_checkpointing: false
+        fine_grained_activation_offloading: false
+        offload_modules: null
         recompute_granularity: "full"
         recompute_modules: null
         converter_type: "Qwen3ForCausalLM"

@@ -124,6 +124,8 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1
     activation_checkpointing: false
+    fine_grained_activation_offloading: false
+    offload_modules: null
     recompute_granularity: "full"
     recompute_modules: null
     tensor_model_parallel_size: 2

@@ -135,6 +135,8 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
+    fine_grained_activation_offloading: false
+    offload_modules: null
     recompute_granularity: "full"
     recompute_modules: null
     converter_type: "Qwen2ForCausalLM"

@@ -105,6 +105,8 @@ policy:
     env_vars: {}
     empty_unused_memory_level: 1
     activation_checkpointing: false
+    fine_grained_activation_offloading: false
+    offload_modules: null
     recompute_granularity: "full"
     recompute_modules: null
     tensor_model_parallel_size: 1