From feee53ef4e35418b352a7aaefcdcf21ed35e48cd Mon Sep 17 00:00:00 2001 From: sna Date: Fri, 17 Apr 2026 00:08:44 -0700 Subject: [PATCH 01/13] Add fine-grained activation offloading for Megatron policy Exposes Megatron-Core fine_grained_activation_offloading and offload_modules through PolicyConfig so training can offload specific submodule activations (moe_act, core_attn, qkv_linear, mlp_norm, attn_norm) to CPU. Works for both dense and MoE models. Validation of module names is left to Megatron. Signed-off-by: sna --- nemo_rl/models/megatron/setup.py | 18 ++++++++++++++++++ nemo_rl/models/policy/__init__.py | 8 ++++++++ 2 files changed, 26 insertions(+) diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index fc5c6c44fa..4bdc1eb0a2 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -514,6 +514,24 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: "Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue." ) + # Fine-grained activation offloading moves specified submodule activations + # to CPU. Works for both dense and MoE models; the user picks which + # submodules to offload via offload_modules. Megatron owns the list of + # valid module names and their per-model-type compatibility, so we only + # require a non-empty list here and let Megatron validate the contents. + fine_grained_activation_offloading = config["megatron_cfg"].get( + "fine_grained_activation_offloading", False + ) + if fine_grained_activation_offloading: + offload_modules = config["megatron_cfg"].get("offload_modules", []) + if not offload_modules: + raise ValueError( + "offload_modules must be a non-empty list when " + "fine_grained_activation_offloading is True." + ) + model_cfg.fine_grained_activation_offloading = True + model_cfg.offload_modules = offload_modules + def _validate_optimizer_config(config: PolicyConfig) -> None: """Validate optimizer configuration.""" diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index ec4c9e66bb..28d9a8bbc7 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -236,6 +236,14 @@ class MegatronConfig(TypedDict): moe_token_dispatcher_type: str # Can be used only with 'alltoall' token dispatcher moe_shared_expert_overlap: bool + # Offload specific module activations to CPU to reduce peak GPU memory. + # Works with MoE models (offloads MoE expert activations). Different from + # optimizer_cpu_offload which offloads optimizer states. + fine_grained_activation_offloading: NotRequired[bool] + # Modules to offload when fine_grained_activation_offloading is True. + # Defaults to ["moe_act"] if not specified. Valid values include: + # "moe_act", "core_attn", "qkv_linear", "mlp_norm", "attn_norm". + offload_modules: NotRequired[list[str]] peft: NotRequired[MegatronPeftConfig | MegatronPeftConfigDisabled] optimizer: MegatronOptimizerConfig scheduler: MegatronSchedulerConfig From 0b530c4d4dcff294584b963fd453f2142d1b5c3f Mon Sep 17 00:00:00 2001 From: Seonjin Date: Thu, 23 Apr 2026 14:31:47 -0700 Subject: [PATCH 02/13] Update nemo_rl/models/policy/__init__.py Co-authored-by: Terry Kong Signed-off-by: Seonjin --- nemo_rl/models/policy/__init__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 28d9a8bbc7..fa59cd94e6 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -237,12 +237,15 @@ class MegatronConfig(TypedDict): # Can be used only with 'alltoall' token dispatcher moe_shared_expert_overlap: bool # Offload specific module activations to CPU to reduce peak GPU memory. - # Works with MoE models (offloads MoE expert activations). Different from + # Works with both dense and MoE models. Different from # optimizer_cpu_offload which offloads optimizer states. + # Requires transformer_engine implementation. fine_grained_activation_offloading: NotRequired[bool] # Modules to offload when fine_grained_activation_offloading is True. - # Defaults to ["moe_act"] if not specified. Valid values include: - # "moe_act", "core_attn", "qkv_linear", "mlp_norm", "attn_norm". + # Required (no default). Valid values: + # "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm", + # "expert_fc1", "moe_act". Note: "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 offload_modules: NotRequired[list[str]] peft: NotRequired[MegatronPeftConfig | MegatronPeftConfigDisabled] optimizer: MegatronOptimizerConfig From d5df80afa833702a1a96f808275463d70e43d427 Mon Sep 17 00:00:00 2001 From: Seonjin Date: Thu, 23 Apr 2026 14:32:14 -0700 Subject: [PATCH 03/13] Update nemo_rl/models/megatron/setup.py Co-authored-by: Terry Kong Signed-off-by: Seonjin --- nemo_rl/models/megatron/setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 4bdc1eb0a2..caef9c10d9 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -524,11 +524,12 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: ) if fine_grained_activation_offloading: offload_modules = config["megatron_cfg"].get("offload_modules", []) - if not offload_modules: + if not isinstance(offload_modules, list) or not offload_modules: raise ValueError( "offload_modules must be a non-empty list when " "fine_grained_activation_offloading is True." ) + ) model_cfg.fine_grained_activation_offloading = True model_cfg.offload_modules = offload_modules From e23798739aace8e26989726f05e4902d78ea16b4 Mon Sep 17 00:00:00 2001 From: sna Date: Thu, 14 May 2026 14:16:44 -0700 Subject: [PATCH 04/13] fix: remove stray paren in setup.py raising ValueError A stray closing parenthesis after the raise ValueError block caused a SyntaxError, blocking the ruff/ruff-format pre-commit hooks in CI. Signed-off-by: sna --- nemo_rl/models/megatron/setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 685e8aa3e7..841ac8763b 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -687,7 +687,6 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: "offload_modules must be a non-empty list when " "fine_grained_activation_offloading is True." ) - ) model_cfg.fine_grained_activation_offloading = True model_cfg.offload_modules = offload_modules From 06b4d4abcfd3eebe1356827bfeccba66c7bed8c4 Mon Sep 17 00:00:00 2001 From: sna Date: Fri, 15 May 2026 11:52:45 -0700 Subject: [PATCH 05/13] fix: pin NeMo Gym docs URL to v0.2.1 (latest 404) Signed-off-by: sna --- docs/design-docs/nemo-gym-integration.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/design-docs/nemo-gym-integration.md b/docs/design-docs/nemo-gym-integration.md index 33e324547b..0263d36fef 100644 --- a/docs/design-docs/nemo-gym-integration.md +++ b/docs/design-docs/nemo-gym-integration.md @@ -181,7 +181,7 @@ sequenceDiagram GRPO->>Policy: Compute loss and train ``` -> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/latest/about/concepts/core-components.html)): +> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/v0.2.1/about/concepts/core-components/)): > - **Agent Server**: Orchestrates the rollout loop > - **Model Server**: HTTP proxy to vLLM; translates Responses API ↔ Chat Completions > - **Resource Server**: Provides tools and rewards From 522521770628acb6ff9d31407d8c92fdb825261b Mon Sep 17 00:00:00 2001 From: sna Date: Sat, 16 May 2026 15:23:14 -0700 Subject: [PATCH 06/13] test: add unit tests for fine_grained_activation_offloading branch Covers _apply_performance_config offload-modules dispatch: - happy path: True + non-empty list sets both attrs - disabled: defaults skip the branch (no attrs touched) - invalid offload_modules ([], None, str, int) all raise ValueError - missing offload_modules key raises ValueError Lifts patch coverage above codecov target. Signed-off-by: sna --- .../models/megatron/test_megatron_setup.py | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py index 1eaa3a1247..c2dce64c31 100644 --- a/tests/unit/models/megatron/test_megatron_setup.py +++ b/tests/unit/models/megatron/test_megatron_setup.py @@ -794,6 +794,99 @@ def test_fp8_param_warning(self): with pytest.warns(UserWarning, match="fp8_param=True sometimes causes NaN"): _apply_performance_config(model_cfg, config) + def test_fine_grained_activation_offloading_enabled(self): + """Test happy path: enabled with non-empty offload_modules list.""" + from nemo_rl.models.megatron.setup import _apply_performance_config + + model_cfg = MagicMock() + model_cfg.gated_linear_unit = True + offload_modules = ["mlp", "moe_act"] + config = { + "megatron_cfg": { + "activation_checkpointing": False, + "apply_rope_fusion": False, + "bias_activation_fusion": False, + "gradient_accumulation_fusion": False, + "fine_grained_activation_offloading": True, + "offload_modules": offload_modules, + } + } + + _apply_performance_config(model_cfg, config) + + assert model_cfg.fine_grained_activation_offloading is True + assert model_cfg.offload_modules == offload_modules + + def test_fine_grained_activation_offloading_disabled_skips(self): + """When flag is False (default), no offload attrs should be set.""" + from nemo_rl.models.megatron.setup import _apply_performance_config + + model_cfg = MagicMock(spec=["gated_linear_unit"]) + model_cfg.gated_linear_unit = True + config = { + "megatron_cfg": { + "activation_checkpointing": False, + "apply_rope_fusion": False, + "bias_activation_fusion": False, + "gradient_accumulation_fusion": False, + } + } + + _apply_performance_config(model_cfg, config) + + assert not hasattr(model_cfg, "fine_grained_activation_offloading") + assert not hasattr(model_cfg, "offload_modules") + + @pytest.mark.parametrize( + "offload_modules", + [[], None, "mlp", 42], + ids=["empty_list", "none", "string", "int"], + ) + def test_fine_grained_activation_offloading_invalid_modules_raises( + self, offload_modules + ): + """offload_modules must be a non-empty list when feature is enabled.""" + from nemo_rl.models.megatron.setup import _apply_performance_config + + model_cfg = MagicMock() + model_cfg.gated_linear_unit = True + config = { + "megatron_cfg": { + "activation_checkpointing": False, + "apply_rope_fusion": False, + "bias_activation_fusion": False, + "gradient_accumulation_fusion": False, + "fine_grained_activation_offloading": True, + "offload_modules": offload_modules, + } + } + + with pytest.raises( + ValueError, match="offload_modules must be a non-empty list" + ): + _apply_performance_config(model_cfg, config) + + def test_fine_grained_activation_offloading_missing_modules_raises(self): + """When enabled but offload_modules key is absent, defaults to [] → raises.""" + from nemo_rl.models.megatron.setup import _apply_performance_config + + model_cfg = MagicMock() + model_cfg.gated_linear_unit = True + config = { + "megatron_cfg": { + "activation_checkpointing": False, + "apply_rope_fusion": False, + "bias_activation_fusion": False, + "gradient_accumulation_fusion": False, + "fine_grained_activation_offloading": True, + } + } + + with pytest.raises( + ValueError, match="offload_modules must be a non-empty list" + ): + _apply_performance_config(model_cfg, config) + @pytest.mark.mcore class TestValidateOptimizerConfig: From 211e31a37dc576837437582842774ad7b30ee172 Mon Sep 17 00:00:00 2001 From: Seonjin Date: Mon, 18 May 2026 22:14:36 -0700 Subject: [PATCH 07/13] Update nemo_rl/models/megatron/setup.py Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> Signed-off-by: Seonjin --- nemo_rl/models/megatron/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 841ac8763b..1857185048 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -681,7 +681,7 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: "fine_grained_activation_offloading", False ) if fine_grained_activation_offloading: - offload_modules = config["megatron_cfg"].get("offload_modules", []) + offload_modules = config["megatron_cfg"].get("offload_modules") if not isinstance(offload_modules, list) or not offload_modules: raise ValueError( "offload_modules must be a non-empty list when " From 502d2ddf1aeecfd2a7ecad6e325f158143941d98 Mon Sep 17 00:00:00 2001 From: Seonjin Date: Mon, 18 May 2026 22:14:48 -0700 Subject: [PATCH 08/13] Update nemo_rl/models/megatron/setup.py Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> Signed-off-by: Seonjin --- nemo_rl/models/megatron/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 1857185048..b141215874 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -678,7 +678,7 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: # valid module names and their per-model-type compatibility, so we only # require a non-empty list here and let Megatron validate the contents. fine_grained_activation_offloading = config["megatron_cfg"].get( - "fine_grained_activation_offloading", False + "fine_grained_activation_offloading" ) if fine_grained_activation_offloading: offload_modules = config["megatron_cfg"].get("offload_modules") From da947f801689a9e13e3d885599731b111d6f5e72 Mon Sep 17 00:00:00 2001 From: Seonjin Date: Mon, 18 May 2026 22:38:18 -0700 Subject: [PATCH 09/13] Update tests/unit/models/megatron/test_megatron_setup.py Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com> Signed-off-by: Seonjin --- tests/unit/models/megatron/test_megatron_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py index c2dce64c31..ff081ba68c 100644 --- a/tests/unit/models/megatron/test_megatron_setup.py +++ b/tests/unit/models/megatron/test_megatron_setup.py @@ -864,7 +864,7 @@ def test_fine_grained_activation_offloading_invalid_modules_raises( with pytest.raises( ValueError, match="offload_modules must be a non-empty list" ): - _apply_performance_config(model_cfg, config) + """When enabled but offload_modules key is absent, defaults to None → raises.""" def test_fine_grained_activation_offloading_missing_modules_raises(self): """When enabled but offload_modules key is absent, defaults to [] → raises.""" From 254c7d097de8646f0b0671b22004218e96488371 Mon Sep 17 00:00:00 2001 From: seonjinn Date: Tue, 19 May 2026 14:04:01 -0700 Subject: [PATCH 10/13] Fix syntax error from main merge in offload test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The merge resolution left an empty 'with pytest.raises(...)' block — restore the _apply_performance_config(model_cfg, config) call inside it. Signed-off-by: seonjinn --- tests/unit/models/megatron/test_megatron_setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py index 4ec60bda13..617428fd11 100644 --- a/tests/unit/models/megatron/test_megatron_setup.py +++ b/tests/unit/models/megatron/test_megatron_setup.py @@ -1075,7 +1075,7 @@ def test_fine_grained_activation_offloading_invalid_modules_raises( with pytest.raises( ValueError, match="offload_modules must be a non-empty list" ): - """When enabled but offload_modules key is absent, defaults to None → raises.""" + _apply_performance_config(model_cfg, config) def test_fine_grained_activation_offloading_missing_modules_raises(self): """When enabled but offload_modules key is absent, defaults to [] → raises.""" From ed86517d14ffc7d3266eb5394570c079d383b3a3 Mon Sep 17 00:00:00 2001 From: seonjinn Date: Sat, 23 May 2026 00:10:47 -0700 Subject: [PATCH 11/13] Address review comments on fine_grained_activation_offloading - Add fine_grained_activation_offloading and offload_modules to all megatron-capable exemplar configs (grpo_math_1B, grpo_math_1B_megatron, sft, dpo, distillation_math, distillation_math_megatron). - Sync tests/unit/reference_configs/*.yaml with the new keys so test_reference_configs_up_to_date passes. - Trim setup.py block comment to a single line per reviewer feedback. - Fix test docstring to reflect that .get() defaults to None. Signed-off-by: seonjinn --- examples/configs/distillation_math.yaml | 9 +++++++++ examples/configs/distillation_math_megatron.yaml | 9 +++++++++ examples/configs/dpo.yaml | 9 +++++++++ examples/configs/grpo_math_1B.yaml | 9 +++++++++ examples/configs/grpo_math_1B_megatron.yaml | 9 +++++++++ examples/configs/sft.yaml | 9 +++++++++ nemo_rl/models/megatron/setup.py | 6 +----- tests/unit/models/megatron/test_megatron_setup.py | 2 +- tests/unit/reference_configs/distillation_math.yaml | 2 ++ tests/unit/reference_configs/dpo.yaml | 2 ++ tests/unit/reference_configs/grpo_math_1B.yaml | 2 ++ tests/unit/reference_configs/sft.yaml | 2 ++ 12 files changed, 64 insertions(+), 6 deletions(-) diff --git a/examples/configs/distillation_math.yaml b/examples/configs/distillation_math.yaml index ab88186661..a1e6694467 100644 --- a/examples/configs/distillation_math.yaml +++ b/examples/configs/distillation_math.yaml @@ -90,6 +90,15 @@ policy: &POLICY_BASE force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 0 activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null converter_type: "Qwen3ForCausalLM" tensor_model_parallel_size: 2 expert_tensor_parallel_size: 1 diff --git a/examples/configs/distillation_math_megatron.yaml b/examples/configs/distillation_math_megatron.yaml index 5c0eb71a90..d2e56aab6f 100644 --- a/examples/configs/distillation_math_megatron.yaml +++ b/examples/configs/distillation_math_megatron.yaml @@ -39,6 +39,15 @@ policy: &POLICY_BASE enabled: true empty_unused_memory_level: 0 activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null converter_type: "Qwen3ForCausalLM" tensor_model_parallel_size: 2 expert_tensor_parallel_size: 1 diff --git a/examples/configs/dpo.yaml b/examples/configs/dpo.yaml index 17a49669e0..d68bcbcd40 100755 --- a/examples/configs/dpo.yaml +++ b/examples/configs/dpo.yaml @@ -129,6 +129,15 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null tensor_model_parallel_size: 2 expert_tensor_parallel_size: 1 expert_model_parallel_size: 1 diff --git a/examples/configs/grpo_math_1B.yaml b/examples/configs/grpo_math_1B.yaml index 4e2b8241f2..66e3b61d4e 100644 --- a/examples/configs/grpo_math_1B.yaml +++ b/examples/configs/grpo_math_1B.yaml @@ -134,6 +134,15 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null converter_type: "Qwen2ForCausalLM" tensor_model_parallel_size: 1 expert_tensor_parallel_size: 1 diff --git a/examples/configs/grpo_math_1B_megatron.yaml b/examples/configs/grpo_math_1B_megatron.yaml index 084d662130..68a7f5ea5c 100644 --- a/examples/configs/grpo_math_1B_megatron.yaml +++ b/examples/configs/grpo_math_1B_megatron.yaml @@ -86,6 +86,15 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null converter_type: "Qwen2ForCausalLM" tensor_model_parallel_size: 1 expert_tensor_parallel_size: 1 diff --git a/examples/configs/sft.yaml b/examples/configs/sft.yaml index cf02bdfc74..1ff1e18deb 100644 --- a/examples/configs/sft.yaml +++ b/examples/configs/sft.yaml @@ -110,6 +110,15 @@ policy: env_vars: {} empty_unused_memory_level: 1 activation_checkpointing: false + # Offload specific module activations to CPU. Works for both dense and MoE + # models. Requires transformer_engine. Different from optimizer_cpu_offload + # which offloads optimizer states. + fine_grained_activation_offloading: false + # Modules to offload when fine_grained_activation_offloading is true. + # Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj", + # "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn". + # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 + offload_modules: null tensor_model_parallel_size: 1 expert_tensor_parallel_size: 1 expert_model_parallel_size: 1 diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index 5e6bfe6d9f..d3e69c3ac2 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -718,11 +718,7 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: "Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue." ) - # Fine-grained activation offloading moves specified submodule activations - # to CPU. Works for both dense and MoE models; the user picks which - # submodules to offload via offload_modules. Megatron owns the list of - # valid module names and their per-model-type compatibility, so we only - # require a non-empty list here and let Megatron validate the contents. + # Megatron validates module names and per-model-type compatibility. fine_grained_activation_offloading = config["megatron_cfg"].get( "fine_grained_activation_offloading" ) diff --git a/tests/unit/models/megatron/test_megatron_setup.py b/tests/unit/models/megatron/test_megatron_setup.py index 617428fd11..5ee070c74d 100644 --- a/tests/unit/models/megatron/test_megatron_setup.py +++ b/tests/unit/models/megatron/test_megatron_setup.py @@ -1078,7 +1078,7 @@ def test_fine_grained_activation_offloading_invalid_modules_raises( _apply_performance_config(model_cfg, config) def test_fine_grained_activation_offloading_missing_modules_raises(self): - """When enabled but offload_modules key is absent, defaults to [] → raises.""" + """When enabled but offload_modules key is absent, defaults to None → raises.""" from nemo_rl.models.megatron.setup import _apply_performance_config model_cfg = MagicMock() diff --git a/tests/unit/reference_configs/distillation_math.yaml b/tests/unit/reference_configs/distillation_math.yaml index ab88186661..1ff9917b51 100644 --- a/tests/unit/reference_configs/distillation_math.yaml +++ b/tests/unit/reference_configs/distillation_math.yaml @@ -90,6 +90,8 @@ policy: &POLICY_BASE force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 0 activation_checkpointing: false + fine_grained_activation_offloading: false + offload_modules: null converter_type: "Qwen3ForCausalLM" tensor_model_parallel_size: 2 expert_tensor_parallel_size: 1 diff --git a/tests/unit/reference_configs/dpo.yaml b/tests/unit/reference_configs/dpo.yaml index 415512addb..b2a930dc02 100755 --- a/tests/unit/reference_configs/dpo.yaml +++ b/tests/unit/reference_configs/dpo.yaml @@ -124,6 +124,8 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 activation_checkpointing: false + fine_grained_activation_offloading: false + offload_modules: null tensor_model_parallel_size: 2 expert_tensor_parallel_size: 1 expert_model_parallel_size: 1 diff --git a/tests/unit/reference_configs/grpo_math_1B.yaml b/tests/unit/reference_configs/grpo_math_1B.yaml index b3bf195c1c..2167e0b1fa 100644 --- a/tests/unit/reference_configs/grpo_math_1B.yaml +++ b/tests/unit/reference_configs/grpo_math_1B.yaml @@ -135,6 +135,8 @@ policy: force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory. activation_checkpointing: false + fine_grained_activation_offloading: false + offload_modules: null converter_type: "Qwen2ForCausalLM" tensor_model_parallel_size: 1 expert_tensor_parallel_size: 1 diff --git a/tests/unit/reference_configs/sft.yaml b/tests/unit/reference_configs/sft.yaml index 416895b635..6f03e7cbd6 100644 --- a/tests/unit/reference_configs/sft.yaml +++ b/tests/unit/reference_configs/sft.yaml @@ -105,6 +105,8 @@ policy: env_vars: {} empty_unused_memory_level: 1 activation_checkpointing: false + fine_grained_activation_offloading: false + offload_modules: null tensor_model_parallel_size: 1 expert_tensor_parallel_size: 1 expert_model_parallel_size: 1 From d89ac84382a652147cc9deb6090f96c8518df80c Mon Sep 17 00:00:00 2001 From: seonjinn Date: Sat, 23 May 2026 00:13:02 -0700 Subject: [PATCH 12/13] Address remaining terrykong review comments - Add NVTE_CPU_OFFLOAD_V1=1 note (TE >= 2.10.0) to TypedDict comment in policy/__init__.py so users see the env requirement up front rather than via a late Megatron-Bridge validation error. - Document the NUMA affinity gap in megatron/setup.py: Megatron-Bridge's standalone path calls set_ideal_affinity_for_current_gpu() when this feature is on; NeMo-RL does not, so this comment points users who care about offload bandwidth at the external workaround. Signed-off-by: seonjinn --- nemo_rl/models/megatron/setup.py | 4 ++++ nemo_rl/models/policy/__init__.py | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo_rl/models/megatron/setup.py b/nemo_rl/models/megatron/setup.py index d3e69c3ac2..29266a5b4e 100644 --- a/nemo_rl/models/megatron/setup.py +++ b/nemo_rl/models/megatron/setup.py @@ -719,6 +719,10 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: ) # Megatron validates module names and per-model-type compatibility. + # Note: Megatron-Bridge's standalone training path also sets NUMA-aware + # CPU affinity via set_ideal_affinity_for_current_gpu() when this is on, + # which improves PCIe/DRAM throughput. NeMo-RL does not call it; users + # who need maximum offload bandwidth may want to set affinity externally. fine_grained_activation_offloading = config["megatron_cfg"].get( "fine_grained_activation_offloading" ) diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 3daabb7d45..64c49dd0c0 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -240,7 +240,9 @@ class MegatronConfig(TypedDict): # Offload specific module activations to CPU to reduce peak GPU memory. # Works with both dense and MoE models. Different from # optimizer_cpu_offload which offloads optimizer states. - # Requires transformer_engine implementation. + # Requires transformer_engine. For TE >= 2.10.0 also requires + # NVTE_CPU_OFFLOAD_V1=1 in the environment (validated by + # Megatron-Bridge at runtime). fine_grained_activation_offloading: NotRequired[bool] # Modules to offload when fine_grained_activation_offloading is True. # Required (no default). Valid values: From aaece7b131786e62671ddae8e1c4598776357e16 Mon Sep 17 00:00:00 2001 From: seonjinn Date: Sat, 23 May 2026 00:39:08 -0700 Subject: [PATCH 13/13] Allow offload_modules to be None in MegatronConfig TypedDict NotRequired[list[str]] caused pydantic to reject `offload_modules: null` in YAML, breaking L1 run_vlm_grpo (and any recipe loading an exemplar megatron config when the feature is off). Wrap with Optional so the exemplar default `null` is accepted; the runtime validation in setup.py already raises if the feature is on with a non-list / empty value. Signed-off-by: seonjinn --- nemo_rl/models/policy/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo_rl/models/policy/__init__.py b/nemo_rl/models/policy/__init__.py index 64c49dd0c0..b336703c86 100644 --- a/nemo_rl/models/policy/__init__.py +++ b/nemo_rl/models/policy/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Literal, NotRequired, TypedDict, Union +from typing import Any, Literal, NotRequired, Optional, TypedDict, Union from nemo_rl.models.generation.interfaces import GenerationConfig from nemo_rl.utils.checkpoint import PretrainedCheckpointConfig @@ -249,7 +249,7 @@ class MegatronConfig(TypedDict): # "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm", # "expert_fc1", "moe_act". Note: "attn_proj" requires "core_attn". # See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448 - offload_modules: NotRequired[list[str]] + offload_modules: NotRequired[Optional[list[str]]] # Enable grouped GEMM for MoE experts via CUTLASS. Significant throughput # gain when multiple experts are assigned per rank (num_local_experts > 1). # Requires TE >= 1.11.0 for FP8 and Ampere (sm_80) or newer.