From feee53ef4e35418b352a7aaefcdcf21ed35e48cd Mon Sep 17 00:00:00 2001
From: sna <sna@nvidia.com>
Date: Fri, 17 Apr 2026 00:08:44 -0700
Subject: [PATCH 01/13] Add fine-grained activation offloading for Megatron
 policy

Exposes Megatron-Core fine_grained_activation_offloading and
offload_modules through PolicyConfig so training can offload specific
submodule activations (moe_act, core_attn, qkv_linear, mlp_norm,
attn_norm) to CPU. Works for both dense and MoE models. Validation of
module names is left to Megatron.

Signed-off-by: sna <sna@nvidia.com>
---
 nemo_rl/models/megatron/setup.py  | 18 ++++++++++++++++++
 nemo_rl/models/policy/__init__.py |  8 ++++++++
 2 files changed, 26 insertions(+)

diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index fc5c6c44fa..4bdc1eb0a2 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -514,6 +514,24 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
                 "Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue."
             )
 
+    # Fine-grained activation offloading moves specified submodule activations
+    # to CPU. Works for both dense and MoE models; the user picks which
+    # submodules to offload via offload_modules. Megatron owns the list of
+    # valid module names and their per-model-type compatibility, so we only
+    # require a non-empty list here and let Megatron validate the contents.
+    fine_grained_activation_offloading = config["megatron_cfg"].get(
+        "fine_grained_activation_offloading", False
+    )
+    if fine_grained_activation_offloading:
+        offload_modules = config["megatron_cfg"].get("offload_modules", [])
+        if not offload_modules:
+            raise ValueError(
+                "offload_modules must be a non-empty list when "
+                "fine_grained_activation_offloading is True."
+            )
+        model_cfg.fine_grained_activation_offloading = True
+        model_cfg.offload_modules = offload_modules
+
 
 def _validate_optimizer_config(config: PolicyConfig) -> None:
     """Validate optimizer configuration."""
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index ec4c9e66bb..28d9a8bbc7 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -236,6 +236,14 @@ class MegatronConfig(TypedDict):
     moe_token_dispatcher_type: str
     # Can be used only with 'alltoall' token dispatcher
     moe_shared_expert_overlap: bool
+    # Offload specific module activations to CPU to reduce peak GPU memory.
+    # Works with MoE models (offloads MoE expert activations). Different from
+    # optimizer_cpu_offload which offloads optimizer states.
+    fine_grained_activation_offloading: NotRequired[bool]
+    # Modules to offload when fine_grained_activation_offloading is True.
+    # Defaults to ["moe_act"] if not specified. Valid values include:
+    # "moe_act", "core_attn", "qkv_linear", "mlp_norm", "attn_norm".
+    offload_modules: NotRequired[list[str]]
     peft: NotRequired[MegatronPeftConfig | MegatronPeftConfigDisabled]
     optimizer: MegatronOptimizerConfig
     scheduler: MegatronSchedulerConfig

From 0b530c4d4dcff294584b963fd453f2142d1b5c3f Mon Sep 17 00:00:00 2001
From: Seonjin <sna@nvidia.com>
Date: Thu, 23 Apr 2026 14:31:47 -0700
Subject: [PATCH 02/13] Update nemo_rl/models/policy/__init__.py

Co-authored-by: Terry Kong <terryk@nvidia.com>
Signed-off-by: Seonjin <sna@nvidia.com>
---
 nemo_rl/models/policy/__init__.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index 28d9a8bbc7..fa59cd94e6 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -237,12 +237,15 @@ class MegatronConfig(TypedDict):
     # Can be used only with 'alltoall' token dispatcher
     moe_shared_expert_overlap: bool
     # Offload specific module activations to CPU to reduce peak GPU memory.
-    # Works with MoE models (offloads MoE expert activations). Different from
+    # Works with both dense and MoE models. Different from
     # optimizer_cpu_offload which offloads optimizer states.
+    # Requires transformer_engine implementation.
     fine_grained_activation_offloading: NotRequired[bool]
     # Modules to offload when fine_grained_activation_offloading is True.
-    # Defaults to ["moe_act"] if not specified. Valid values include:
-    # "moe_act", "core_attn", "qkv_linear", "mlp_norm", "attn_norm".
+    # Required (no default). Valid values:
+    # "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm",
+    # "expert_fc1", "moe_act". Note: "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
     offload_modules: NotRequired[list[str]]
     peft: NotRequired[MegatronPeftConfig | MegatronPeftConfigDisabled]
     optimizer: MegatronOptimizerConfig

From d5df80afa833702a1a96f808275463d70e43d427 Mon Sep 17 00:00:00 2001
From: Seonjin <sna@nvidia.com>
Date: Thu, 23 Apr 2026 14:32:14 -0700
Subject: [PATCH 03/13] Update nemo_rl/models/megatron/setup.py

Co-authored-by: Terry Kong <terryk@nvidia.com>
Signed-off-by: Seonjin <sna@nvidia.com>
---
 nemo_rl/models/megatron/setup.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 4bdc1eb0a2..caef9c10d9 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -524,11 +524,12 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
     )
     if fine_grained_activation_offloading:
         offload_modules = config["megatron_cfg"].get("offload_modules", [])
-        if not offload_modules:
+        if not isinstance(offload_modules, list) or not offload_modules:
             raise ValueError(
                 "offload_modules must be a non-empty list when "
                 "fine_grained_activation_offloading is True."
             )
+            )
         model_cfg.fine_grained_activation_offloading = True
         model_cfg.offload_modules = offload_modules
 

From e23798739aace8e26989726f05e4902d78ea16b4 Mon Sep 17 00:00:00 2001
From: sna <sna@nvidia.com>
Date: Thu, 14 May 2026 14:16:44 -0700
Subject: [PATCH 04/13] fix: remove stray paren after raise ValueError in setup.py

A stray closing parenthesis after the raise ValueError block caused a
SyntaxError, blocking the ruff/ruff-format pre-commit hooks in CI.

Signed-off-by: sna <sna@nvidia.com>
---
 nemo_rl/models/megatron/setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 685e8aa3e7..841ac8763b 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -687,7 +687,6 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
                 "offload_modules must be a non-empty list when "
                 "fine_grained_activation_offloading is True."
             )
-            )
         model_cfg.fine_grained_activation_offloading = True
         model_cfg.offload_modules = offload_modules
 

From 06b4d4abcfd3eebe1356827bfeccba66c7bed8c4 Mon Sep 17 00:00:00 2001
From: sna <sna@nvidia.com>
Date: Fri, 15 May 2026 11:52:45 -0700
Subject: [PATCH 05/13] fix: pin NeMo Gym docs URL to v0.2.1 (latest returns 404)

Signed-off-by: sna <sna@nvidia.com>
---
 docs/design-docs/nemo-gym-integration.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/design-docs/nemo-gym-integration.md b/docs/design-docs/nemo-gym-integration.md
index 33e324547b..0263d36fef 100644
--- a/docs/design-docs/nemo-gym-integration.md
+++ b/docs/design-docs/nemo-gym-integration.md
@@ -181,7 +181,7 @@ sequenceDiagram
     GRPO->>Policy: Compute loss and train
 ```
 
-> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/latest/about/concepts/core-components.html)):
+> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/v0.2.1/about/concepts/core-components/)):
 > - **Agent Server**: Orchestrates the rollout loop
 > - **Model Server**: HTTP proxy to vLLM; translates Responses API ↔ Chat Completions
 > - **Resource Server**: Provides tools and rewards

From 522521770628acb6ff9d31407d8c92fdb825261b Mon Sep 17 00:00:00 2001
From: sna <sna@nvidia.com>
Date: Sat, 16 May 2026 15:23:14 -0700
Subject: [PATCH 06/13] test: add unit tests for
 fine_grained_activation_offloading branch

Covers _apply_performance_config offload-modules dispatch:
- happy path: True + non-empty list sets both attrs
- disabled: defaults skip the branch (no attrs touched)
- invalid offload_modules ([], None, str, int) all raise ValueError
- missing offload_modules key raises ValueError

Lifts patch coverage above codecov target.

Signed-off-by: sna <sna@nvidia.com>
---
 .../models/megatron/test_megatron_setup.py    | 93 +++++++++++++++++++
 1 file changed, 93 insertions(+)

diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py
index 1eaa3a1247..c2dce64c31 100644
--- a/tests/unit/models/megatron/test_megatron_setup.py
+++ b/tests/unit/models/megatron/test_megatron_setup.py
@@ -794,6 +794,99 @@ def test_fp8_param_warning(self):
         with pytest.warns(UserWarning, match="fp8_param=True sometimes causes NaN"):
             _apply_performance_config(model_cfg, config)
 
+    def test_fine_grained_activation_offloading_enabled(self):
+        """Test happy path: enabled with non-empty offload_modules list."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        offload_modules = ["mlp", "moe_act"]
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+                "offload_modules": offload_modules,
+            }
+        }
+
+        _apply_performance_config(model_cfg, config)
+
+        assert model_cfg.fine_grained_activation_offloading is True
+        assert model_cfg.offload_modules == offload_modules
+
+    def test_fine_grained_activation_offloading_disabled_skips(self):
+        """When flag is False (default), no offload attrs should be set."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock(spec=["gated_linear_unit"])
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+            }
+        }
+
+        _apply_performance_config(model_cfg, config)
+
+        assert not hasattr(model_cfg, "fine_grained_activation_offloading")
+        assert not hasattr(model_cfg, "offload_modules")
+
+    @pytest.mark.parametrize(
+        "offload_modules",
+        [[], None, "mlp", 42],
+        ids=["empty_list", "none", "string", "int"],
+    )
+    def test_fine_grained_activation_offloading_invalid_modules_raises(
+        self, offload_modules
+    ):
+        """offload_modules must be a non-empty list when feature is enabled."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+                "offload_modules": offload_modules,
+            }
+        }
+
+        with pytest.raises(
+            ValueError, match="offload_modules must be a non-empty list"
+        ):
+            _apply_performance_config(model_cfg, config)
+
+    def test_fine_grained_activation_offloading_missing_modules_raises(self):
+        """When enabled but offload_modules key is absent, defaults to [] → raises."""
+        from nemo_rl.models.megatron.setup import _apply_performance_config
+
+        model_cfg = MagicMock()
+        model_cfg.gated_linear_unit = True
+        config = {
+            "megatron_cfg": {
+                "activation_checkpointing": False,
+                "apply_rope_fusion": False,
+                "bias_activation_fusion": False,
+                "gradient_accumulation_fusion": False,
+                "fine_grained_activation_offloading": True,
+            }
+        }
+
+        with pytest.raises(
+            ValueError, match="offload_modules must be a non-empty list"
+        ):
+            _apply_performance_config(model_cfg, config)
+
 
 @pytest.mark.mcore
 class TestValidateOptimizerConfig:

From 211e31a37dc576837437582842774ad7b30ee172 Mon Sep 17 00:00:00 2001
From: Seonjin <sna@nvidia.com>
Date: Mon, 18 May 2026 22:14:36 -0700
Subject: [PATCH 07/13] Update nemo_rl/models/megatron/setup.py

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>
Signed-off-by: Seonjin <sna@nvidia.com>
---
 nemo_rl/models/megatron/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 841ac8763b..1857185048 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -681,7 +681,7 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
         "fine_grained_activation_offloading", False
     )
     if fine_grained_activation_offloading:
-        offload_modules = config["megatron_cfg"].get("offload_modules", [])
+        offload_modules = config["megatron_cfg"].get("offload_modules")
         if not isinstance(offload_modules, list) or not offload_modules:
             raise ValueError(
                 "offload_modules must be a non-empty list when "

From 502d2ddf1aeecfd2a7ecad6e325f158143941d98 Mon Sep 17 00:00:00 2001
From: Seonjin <sna@nvidia.com>
Date: Mon, 18 May 2026 22:14:48 -0700
Subject: [PATCH 08/13] Update nemo_rl/models/megatron/setup.py

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>
Signed-off-by: Seonjin <sna@nvidia.com>
---
 nemo_rl/models/megatron/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 1857185048..b141215874 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -678,7 +678,7 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
     # valid module names and their per-model-type compatibility, so we only
     # require a non-empty list here and let Megatron validate the contents.
     fine_grained_activation_offloading = config["megatron_cfg"].get(
-        "fine_grained_activation_offloading", False
+        "fine_grained_activation_offloading"
     )
     if fine_grained_activation_offloading:
         offload_modules = config["megatron_cfg"].get("offload_modules")

From da947f801689a9e13e3d885599731b111d6f5e72 Mon Sep 17 00:00:00 2001
From: Seonjin <sna@nvidia.com>
Date: Mon, 18 May 2026 22:38:18 -0700
Subject: [PATCH 09/13] Update
 tests/unit/models/megatron/test_megatron_setup.py

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>
Signed-off-by: Seonjin <sna@nvidia.com>
---
 tests/unit/models/megatron/test_megatron_setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py
index c2dce64c31..ff081ba68c 100644
--- a/tests/unit/models/megatron/test_megatron_setup.py
+++ b/tests/unit/models/megatron/test_megatron_setup.py
@@ -864,7 +864,7 @@ def test_fine_grained_activation_offloading_invalid_modules_raises(
         with pytest.raises(
             ValueError, match="offload_modules must be a non-empty list"
         ):
-            _apply_performance_config(model_cfg, config)
+        """When enabled but offload_modules key is absent, defaults to None → raises."""
 
     def test_fine_grained_activation_offloading_missing_modules_raises(self):
         """When enabled but offload_modules key is absent, defaults to [] → raises."""

From 254c7d097de8646f0b0671b22004218e96488371 Mon Sep 17 00:00:00 2001
From: seonjinn <sna@nvidia.com>
Date: Tue, 19 May 2026 14:04:01 -0700
Subject: [PATCH 10/13] Fix syntax error from main merge in offload test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The merge resolution left an empty 'with pytest.raises(...)' block —
restore the _apply_performance_config(model_cfg, config) call inside it.

Signed-off-by: seonjinn <sna@nvidia.com>
---
 tests/unit/models/megatron/test_megatron_setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py
index 4ec60bda13..617428fd11 100644
--- a/tests/unit/models/megatron/test_megatron_setup.py
+++ b/tests/unit/models/megatron/test_megatron_setup.py
@@ -1075,7 +1075,7 @@ def test_fine_grained_activation_offloading_invalid_modules_raises(
         with pytest.raises(
             ValueError, match="offload_modules must be a non-empty list"
         ):
-        """When enabled but offload_modules key is absent, defaults to None → raises."""
+            _apply_performance_config(model_cfg, config)
 
     def test_fine_grained_activation_offloading_missing_modules_raises(self):
         """When enabled but offload_modules key is absent, defaults to [] → raises."""

From ed86517d14ffc7d3266eb5394570c079d383b3a3 Mon Sep 17 00:00:00 2001
From: seonjinn <sna@nvidia.com>
Date: Sat, 23 May 2026 00:10:47 -0700
Subject: [PATCH 11/13] Address review comments on
 fine_grained_activation_offloading

- Add fine_grained_activation_offloading and offload_modules to all
  megatron-capable exemplar configs (grpo_math_1B, grpo_math_1B_megatron,
  sft, dpo, distillation_math, distillation_math_megatron).
- Sync tests/unit/reference_configs/*.yaml with the new keys so
  test_reference_configs_up_to_date passes.
- Trim setup.py block comment to a single line per reviewer feedback.
- Fix test docstring to reflect that .get() defaults to None.

Signed-off-by: seonjinn <sna@nvidia.com>
---
 examples/configs/distillation_math.yaml             | 9 +++++++++
 examples/configs/distillation_math_megatron.yaml    | 9 +++++++++
 examples/configs/dpo.yaml                           | 9 +++++++++
 examples/configs/grpo_math_1B.yaml                  | 9 +++++++++
 examples/configs/grpo_math_1B_megatron.yaml         | 9 +++++++++
 examples/configs/sft.yaml                           | 9 +++++++++
 nemo_rl/models/megatron/setup.py                    | 6 +-----
 tests/unit/models/megatron/test_megatron_setup.py   | 2 +-
 tests/unit/reference_configs/distillation_math.yaml | 2 ++
 tests/unit/reference_configs/dpo.yaml               | 2 ++
 tests/unit/reference_configs/grpo_math_1B.yaml      | 2 ++
 tests/unit/reference_configs/sft.yaml               | 2 ++
 12 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml
index ab88186661..a1e6694467 100644
--- a/examples/configs/distillation_math.yaml
+++ b/examples/configs/distillation_math.yaml
@@ -90,6 +90,15 @@ policy: &POLICY_BASE
         force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
         empty_unused_memory_level: 0
         activation_checkpointing: false
+        # Offload specific module activations to CPU. Works for both dense and MoE
+        # models. Requires transformer_engine. Different from optimizer_cpu_offload
+        # which offloads optimizer states.
+        fine_grained_activation_offloading: false
+        # Modules to offload when fine_grained_activation_offloading is true.
+        # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+        # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+        # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+        offload_modules: null
         converter_type: "Qwen3ForCausalLM"
         tensor_model_parallel_size: 2
         expert_tensor_parallel_size: 1
diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml
index 5c0eb71a90..d2e56aab6f 100644
--- a/examples/configs/distillation_math_megatron.yaml
+++ b/examples/configs/distillation_math_megatron.yaml
@@ -39,6 +39,15 @@ policy: &POLICY_BASE
         enabled: true
         empty_unused_memory_level: 0
         activation_checkpointing: false
+        # Offload specific module activations to CPU. Works for both dense and MoE
+        # models. Requires transformer_engine. Different from optimizer_cpu_offload
+        # which offloads optimizer states.
+        fine_grained_activation_offloading: false
+        # Modules to offload when fine_grained_activation_offloading is true.
+        # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+        # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+        # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+        offload_modules: null
         converter_type: "Qwen3ForCausalLM"
         tensor_model_parallel_size: 2
         expert_tensor_parallel_size: 1
diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml
index 17a49669e0..d68bcbcd40 100755
--- a/examples/configs/dpo.yaml
+++ b/examples/configs/dpo.yaml
@@ -129,6 +129,15 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1
     activation_checkpointing: false
+    # Offload specific module activations to CPU. Works for both dense and MoE
+    # models. Requires transformer_engine. Different from optimizer_cpu_offload
+    # which offloads optimizer states.
+    fine_grained_activation_offloading: false
+    # Modules to offload when fine_grained_activation_offloading is true.
+    # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+    # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: null
     tensor_model_parallel_size: 2
     expert_tensor_parallel_size: 1
     expert_model_parallel_size: 1
diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml
index 4e2b8241f2..66e3b61d4e 100644
--- a/examples/configs/grpo_math_1B.yaml
+++ b/examples/configs/grpo_math_1B.yaml
@@ -134,6 +134,15 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
+    # Offload specific module activations to CPU. Works for both dense and MoE
+    # models. Requires transformer_engine. Different from optimizer_cpu_offload
+    # which offloads optimizer states.
+    fine_grained_activation_offloading: false
+    # Modules to offload when fine_grained_activation_offloading is true.
+    # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+    # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: null
     converter_type: "Qwen2ForCausalLM"
     tensor_model_parallel_size: 1
     expert_tensor_parallel_size: 1
diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml
index 084d662130..68a7f5ea5c 100644
--- a/examples/configs/grpo_math_1B_megatron.yaml
+++ b/examples/configs/grpo_math_1B_megatron.yaml
@@ -86,6 +86,15 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
+    # Offload specific module activations to CPU. Works for both dense and MoE
+    # models. Requires transformer_engine. Different from optimizer_cpu_offload
+    # which offloads optimizer states.
+    fine_grained_activation_offloading: false
+    # Modules to offload when fine_grained_activation_offloading is true.
+    # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+    # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: null
     converter_type: "Qwen2ForCausalLM"
     tensor_model_parallel_size: 1
     expert_tensor_parallel_size: 1
diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml
index cf02bdfc74..1ff1e18deb 100644
--- a/examples/configs/sft.yaml
+++ b/examples/configs/sft.yaml
@@ -110,6 +110,15 @@ policy:
     env_vars: {}
     empty_unused_memory_level: 1
     activation_checkpointing: false
+    # Offload specific module activations to CPU. Works for both dense and MoE
+    # models. Requires transformer_engine. Different from optimizer_cpu_offload
+    # which offloads optimizer states.
+    fine_grained_activation_offloading: false
+    # Modules to offload when fine_grained_activation_offloading is true.
+    # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
+    # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
+    # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
+    offload_modules: null
     tensor_model_parallel_size: 1
     expert_tensor_parallel_size: 1
     expert_model_parallel_size: 1
diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index 5e6bfe6d9f..d3e69c3ac2 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -718,11 +718,7 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
                 "Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue."
             )
 
-    # Fine-grained activation offloading moves specified submodule activations
-    # to CPU. Works for both dense and MoE models; the user picks which
-    # submodules to offload via offload_modules. Megatron owns the list of
-    # valid module names and their per-model-type compatibility, so we only
-    # require a non-empty list here and let Megatron validate the contents.
+    # Megatron validates module names and per-model-type compatibility.
     fine_grained_activation_offloading = config["megatron_cfg"].get(
         "fine_grained_activation_offloading"
     )
diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py
index 617428fd11..5ee070c74d 100644
--- a/tests/unit/models/megatron/test_megatron_setup.py
+++ b/tests/unit/models/megatron/test_megatron_setup.py
@@ -1078,7 +1078,7 @@ def test_fine_grained_activation_offloading_invalid_modules_raises(
             _apply_performance_config(model_cfg, config)
 
     def test_fine_grained_activation_offloading_missing_modules_raises(self):
-        """When enabled but offload_modules key is absent, defaults to [] → raises."""
+        """When enabled but offload_modules key is absent, defaults to None → raises."""
         from nemo_rl.models.megatron.setup import _apply_performance_config
 
         model_cfg = MagicMock()
diff --git a/tests/unit/reference_configs/distillation_math.yaml b/tests/unit/reference_configs/distillation_math.yaml
index ab88186661..1ff9917b51 100644
--- a/tests/unit/reference_configs/distillation_math.yaml
+++ b/tests/unit/reference_configs/distillation_math.yaml
@@ -90,6 +90,8 @@ policy: &POLICY_BASE
         force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
         empty_unused_memory_level: 0
         activation_checkpointing: false
+        fine_grained_activation_offloading: false
+        offload_modules: null
         converter_type: "Qwen3ForCausalLM"
         tensor_model_parallel_size: 2
         expert_tensor_parallel_size: 1
diff --git a/tests/unit/reference_configs/dpo.yaml b/tests/unit/reference_configs/dpo.yaml
index 415512addb..b2a930dc02 100755
--- a/tests/unit/reference_configs/dpo.yaml
+++ b/tests/unit/reference_configs/dpo.yaml
@@ -124,6 +124,8 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1
     activation_checkpointing: false
+    fine_grained_activation_offloading: false
+    offload_modules: null
     tensor_model_parallel_size: 2
     expert_tensor_parallel_size: 1
     expert_model_parallel_size: 1
diff --git a/tests/unit/reference_configs/grpo_math_1B.yaml b/tests/unit/reference_configs/grpo_math_1B.yaml
index b3bf195c1c..2167e0b1fa 100644
--- a/tests/unit/reference_configs/grpo_math_1B.yaml
+++ b/tests/unit/reference_configs/grpo_math_1B.yaml
@@ -135,6 +135,8 @@ policy:
     force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
     empty_unused_memory_level: 1  # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
     activation_checkpointing: false
+    fine_grained_activation_offloading: false
+    offload_modules: null
     converter_type: "Qwen2ForCausalLM"
     tensor_model_parallel_size: 1
     expert_tensor_parallel_size: 1
diff --git a/tests/unit/reference_configs/sft.yaml b/tests/unit/reference_configs/sft.yaml
index 416895b635..6f03e7cbd6 100644
--- a/tests/unit/reference_configs/sft.yaml
+++ b/tests/unit/reference_configs/sft.yaml
@@ -105,6 +105,8 @@ policy:
     env_vars: {}
     empty_unused_memory_level: 1
     activation_checkpointing: false
+    fine_grained_activation_offloading: false
+    offload_modules: null
     tensor_model_parallel_size: 1
     expert_tensor_parallel_size: 1
     expert_model_parallel_size: 1

From d89ac84382a652147cc9deb6090f96c8518df80c Mon Sep 17 00:00:00 2001
From: seonjinn <sna@nvidia.com>
Date: Sat, 23 May 2026 00:13:02 -0700
Subject: [PATCH 12/13] Address remaining terrykong review comments

- Add NVTE_CPU_OFFLOAD_V1=1 note (TE >= 2.10.0) to TypedDict comment in
  policy/__init__.py so users see the env requirement up front rather
  than via a late Megatron-Bridge validation error.
- Document the NUMA affinity gap in megatron/setup.py: Megatron-Bridge's
  standalone path calls set_ideal_affinity_for_current_gpu() when this
  feature is on; NeMo-RL does not, so this comment points users who care
  about offload bandwidth at the external workaround.

Signed-off-by: seonjinn <sna@nvidia.com>
---
 nemo_rl/models/megatron/setup.py  | 4 ++++
 nemo_rl/models/policy/__init__.py | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py
index d3e69c3ac2..29266a5b4e 100644
--- a/nemo_rl/models/megatron/setup.py
+++ b/nemo_rl/models/megatron/setup.py
@@ -719,6 +719,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
             )
 
     # Megatron validates module names and per-model-type compatibility.
+    # Note: when fine-grained activation offloading is on, Megatron-Bridge's
+    # standalone training path also sets NUMA-aware CPU affinity via
+    # set_ideal_affinity_for_current_gpu(), improving PCIe/DRAM throughput.
+    # NeMo-RL does not; set affinity externally for maximum offload bandwidth.
     fine_grained_activation_offloading = config["megatron_cfg"].get(
         "fine_grained_activation_offloading"
     )
diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index 3daabb7d45..64c49dd0c0 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -240,7 +240,9 @@ class MegatronConfig(TypedDict):
     # Offload specific module activations to CPU to reduce peak GPU memory.
     # Works with both dense and MoE models. Different from
     # optimizer_cpu_offload which offloads optimizer states.
-    # Requires transformer_engine implementation.
+    # Requires transformer_engine. For TE >= 2.10.0 also requires
+    # NVTE_CPU_OFFLOAD_V1=1 in the environment (validated by
+    # Megatron-Bridge at runtime).
     fine_grained_activation_offloading: NotRequired[bool]
     # Modules to offload when fine_grained_activation_offloading is True.
     # Required (no default). Valid values:

From aaece7b131786e62671ddae8e1c4598776357e16 Mon Sep 17 00:00:00 2001
From: seonjinn <sna@nvidia.com>
Date: Sat, 23 May 2026 00:39:08 -0700
Subject: [PATCH 13/13] Allow offload_modules to be None in MegatronConfig
 TypedDict

NotRequired[list[str]] caused pydantic to reject `offload_modules: null`
in YAML, breaking L1 run_vlm_grpo (and any recipe loading an exemplar
megatron config when the feature is off). Wrap with Optional so the
exemplar default `null` is accepted; the runtime validation in setup.py
already raises if the feature is on with a non-list / empty value.

Signed-off-by: seonjinn <sna@nvidia.com>
---
 nemo_rl/models/policy/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py
index 64c49dd0c0..b336703c86 100644
--- a/nemo_rl/models/policy/__init__.py
+++ b/nemo_rl/models/policy/__init__.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Any, Literal, NotRequired, TypedDict, Union
+from typing import Any, Literal, NotRequired, Optional, TypedDict, Union
 
 from nemo_rl.models.generation.interfaces import GenerationConfig
 from nemo_rl.utils.checkpoint import PretrainedCheckpointConfig
@@ -249,7 +249,7 @@ class MegatronConfig(TypedDict):
     # "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm",
     # "expert_fc1", "moe_act". Note: "attn_proj" requires "core_attn".
     # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
-    offload_modules: NotRequired[list[str]]
+    offload_modules: NotRequired[Optional[list[str]]]
     # Enable grouped GEMM for MoE experts via CUTLASS. Significant throughput
     # gain when multiple experts are assigned per rank (num_local_experts > 1).
     # Requires TE >= 1.11.0 for FP8 and Ampere (sm_80) or newer.