Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
feee53e
Add fine-grained activation offloading for Megatron policy
seonjinn Apr 17, 2026
0b530c4
Update nemo_rl/models/policy/__init__.py
seonjinn Apr 23, 2026
d5df80a
Update nemo_rl/models/megatron/setup.py
seonjinn Apr 23, 2026
f742bf8
Merge remote-tracking branch 'origin/main' into sj/fine-grained-activ…
seonjinn May 14, 2026
e237987
fix: remove stray paren in setup.py raising ValueError
seonjinn May 14, 2026
4f4681c
Merge remote-tracking branch 'origin/main' into sj/fine-grained-activ…
seonjinn May 15, 2026
06b4d4a
fix: pin NeMo Gym docs URL to v0.2.1 (latest 404)
seonjinn May 15, 2026
2b33ad3
Merge branch 'main' into sj/fine-grained-activation-offload
seonjinn May 15, 2026
5225217
test: add unit tests for fine_grained_activation_offloading branch
seonjinn May 16, 2026
211e31a
Update nemo_rl/models/megatron/setup.py
seonjinn May 19, 2026
502d2dd
Update nemo_rl/models/megatron/setup.py
seonjinn May 19, 2026
da947f8
Update tests/unit/models/megatron/test_megatron_setup.py
seonjinn May 19, 2026
fcbd22b
Merge main into sj/fine-grained-activation-offload
seonjinn May 19, 2026
254c7d0
Fix syntax error from main merge in offload test
seonjinn May 19, 2026
625b25f
Merge branch 'main' into sj/fine-grained-activation-offload
seonjinn May 22, 2026
ed86517
Address review comments on fine_grained_activation_offloading
seonjinn May 23, 2026
d89ac84
Address remaining terrykong review comments
seonjinn May 23, 2026
aaece7b
Allow offload_modules to be None in MegatronConfig TypedDict
seonjinn May 23, 2026
bcccabf
Merge branch 'main' into sj/fine-grained-activation-offload
seonjinn May 23, 2026
08d121d
Merge branch 'main' into sj/fine-grained-activation-offload
seonjinn May 26, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions examples/configs/distillation_math.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,15 @@ policy: &POLICY_BASE
force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
empty_unused_memory_level: 0
activation_checkpointing: false
# Offload specific module activations to CPU. Works for both dense and MoE
# models. Requires transformer_engine. Different from optimizer_cpu_offload
# which offloads optimizer states.
fine_grained_activation_offloading: false
# Modules to offload. Must be a non-empty list when
# fine_grained_activation_offloading is true; null is only accepted while it is false.
# Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
# "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
# See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
offload_modules: null
# recompute_granularity controls activation checkpointing depth.
# "full": recompute all activations (default, max memory savings).
# "selective": recompute only specific modules (see recompute_modules).
Expand Down
9 changes: 9 additions & 0 deletions examples/configs/distillation_math_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,15 @@ policy: &POLICY_BASE
enabled: true
empty_unused_memory_level: 0
activation_checkpointing: false
# Offload specific module activations to CPU. Works for both dense and MoE
# models. Requires transformer_engine. Different from optimizer_cpu_offload
# which offloads optimizer states.
fine_grained_activation_offloading: false
# Modules to offload. Must be a non-empty list when
# fine_grained_activation_offloading is true; null is only accepted while it is false.
# Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
# "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
# See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
offload_modules: null
# recompute_granularity controls activation checkpointing depth.
# "full": recompute all activations (default, max memory savings).
# "selective": recompute only specific modules (see recompute_modules).
Expand Down
9 changes: 9 additions & 0 deletions examples/configs/dpo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,15 @@ policy:
force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
empty_unused_memory_level: 1
activation_checkpointing: false
# Offload specific module activations to CPU. Works for both dense and MoE
# models. Requires transformer_engine. Different from optimizer_cpu_offload
# which offloads optimizer states.
fine_grained_activation_offloading: false
# Modules to offload. Must be a non-empty list when
# fine_grained_activation_offloading is true; null is only accepted while it is false.
# Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
# "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
# See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
offload_modules: null
# recompute_granularity controls activation checkpointing depth.
# "full": recompute all activations (default, max memory savings).
# "selective": recompute only specific modules (see recompute_modules).
Expand Down
9 changes: 9 additions & 0 deletions examples/configs/grpo_math_1B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,15 @@ policy:
force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
activation_checkpointing: false
# Offload specific module activations to CPU. Works for both dense and MoE
# models. Requires transformer_engine. Different from optimizer_cpu_offload
# which offloads optimizer states.
fine_grained_activation_offloading: false
# Modules to offload. Must be a non-empty list when
# fine_grained_activation_offloading is true; null is only accepted while it is false.
# Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
# "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
# See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
offload_modules: null
# recompute_granularity controls activation checkpointing depth.
# "full": recompute all activations (default, max memory savings).
# "selective": recompute only specific modules (see recompute_modules).
Expand Down
9 changes: 9 additions & 0 deletions examples/configs/grpo_math_1B_megatron.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,15 @@ policy:
force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
activation_checkpointing: false
# Offload specific module activations to CPU. Works for both dense and MoE
# models. Requires transformer_engine. Different from optimizer_cpu_offload
# which offloads optimizer states.
fine_grained_activation_offloading: false
# Modules to offload. Must be a non-empty list when
# fine_grained_activation_offloading is true; null is only accepted while it is false.
# Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
# "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
# See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
offload_modules: null
# recompute_granularity controls activation checkpointing depth.
# "full": recompute all activations (default, max memory savings).
# "selective": recompute only specific modules (see recompute_modules).
Expand Down
9 changes: 9 additions & 0 deletions examples/configs/sft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,15 @@ policy:
env_vars: {}
empty_unused_memory_level: 1
activation_checkpointing: false
# Offload specific module activations to CPU. Works for both dense and MoE
# models. Requires transformer_engine. Different from optimizer_cpu_offload
# which offloads optimizer states.
fine_grained_activation_offloading: false
# Modules to offload. Must be a non-empty list when
# fine_grained_activation_offloading is true; null is only accepted while it is false.
# Valid options: ["attn_norm", "qkv_linear", "core_attn", "attn_proj",
# "mlp_norm", "expert_fc1", "moe_act"]. "attn_proj" requires "core_attn".
# See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
offload_modules: null
# recompute_granularity controls activation checkpointing depth.
# "full": recompute all activations (default, max memory savings).
# "selective": recompute only specific modules (see recompute_modules).
Expand Down
18 changes: 18 additions & 0 deletions nemo_rl/models/megatron/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,24 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None:
"Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue."
)

# Megatron validates module names and per-model-type compatibility.
# Note: Megatron-Bridge's standalone training path also sets NUMA-aware
# CPU affinity via set_ideal_affinity_for_current_gpu() when this is on,
# which improves PCIe/DRAM throughput. NeMo-RL does not call it; users
# who need maximum offload bandwidth may want to set affinity externally.
fine_grained_activation_offloading = config["megatron_cfg"].get(
"fine_grained_activation_offloading"
)
Comment thread
seonjinn marked this conversation as resolved.
if fine_grained_activation_offloading:
Comment thread
terrykong marked this conversation as resolved.
offload_modules = config["megatron_cfg"].get("offload_modules")
if not isinstance(offload_modules, list) or not offload_modules:
raise ValueError(
"offload_modules must be a non-empty list when "
"fine_grained_activation_offloading is True."
)
model_cfg.fine_grained_activation_offloading = True
model_cfg.offload_modules = offload_modules

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nemo_rl/models/megatron/setup.py:533

Megatron-Bridge calls set_ideal_affinity_for_current_gpu() when this feature is enabled to optimize PCIe/DRAM transfer throughput via NUMA-aware CPU affinity. NeMo-RL doesn't call this, so users may not get the full performance benefit. Worth considering whether to add it here or document the gap.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@seonjinn wdyt of this?



def _validate_optimizer_config(config: PolicyConfig) -> None:
"""Validate optimizer configuration."""
Expand Down
15 changes: 14 additions & 1 deletion nemo_rl/models/policy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Any, Literal, NotRequired, TypedDict, Union
from typing import Any, Literal, NotRequired, Optional, TypedDict, Union

from nemo_rl.models.generation.interfaces import GenerationConfig
from nemo_rl.utils.checkpoint import PretrainedCheckpointConfig
Expand Down Expand Up @@ -247,6 +247,19 @@ class MegatronConfig(TypedDict):
moe_token_dispatcher_type: str
# Can be used only with 'alltoall' token dispatcher
moe_shared_expert_overlap: bool
# Offload specific module activations to CPU to reduce peak GPU memory.
# Works with both dense and MoE models. Different from
# optimizer_cpu_offload which offloads optimizer states.
# Requires transformer_engine. For TE >= 2.10.0 also requires
# NVTE_CPU_OFFLOAD_V1=1 in the environment (validated by
# Megatron-Bridge at runtime).
fine_grained_activation_offloading: NotRequired[bool]
# Modules to offload when fine_grained_activation_offloading is True.
# Must then be a non-empty list (no default is applied); may be omitted or
# None while the flag is off. Valid values:
# "attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm",
# "expert_fc1", "moe_act". Note: "attn_proj" requires "core_attn".
# See: https://github.com/NVIDIA/Megatron-LM/blob/d30c3ae5469fe3f6a64d4fd2e63b6e7f7844ea81/megatron/core/transformer/transformer_config.py#L1440-L1448
offload_modules: NotRequired[Optional[list[str]]]
# Enable grouped GEMM for MoE experts via CUTLASS. Significant throughput
# gain when multiple experts are assigned per rank (num_local_experts > 1).
# Requires TE >= 1.11.0 for FP8 and Ampere (sm_80) or newer.
Expand Down
93 changes: 93 additions & 0 deletions tests/unit/models/megatron/test_megatron_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -1005,6 +1005,99 @@ def test_fp8_param_warning(self):
with pytest.warns(UserWarning, match="fp8_param=True sometimes causes NaN"):
_apply_performance_config(model_cfg, config)

def test_fine_grained_activation_offloading_enabled(self):
"""Test happy path: enabled with non-empty offload_modules list."""
from nemo_rl.models.megatron.setup import _apply_performance_config

model_cfg = MagicMock()
model_cfg.gated_linear_unit = True
offload_modules = ["mlp", "moe_act"]

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: "mlp" is not in the documented valid options ("attn_norm", "qkv_linear", "core_attn", "attn_proj", "mlp_norm", "expert_fc1", "moe_act"). While Megatron validates downstream, the happy-path test should use valid module names to avoid confusion.

Suggested change
offload_modules = ["mlp", "moe_act"]
offload_modules = ["mlp_norm", "moe_act"]

config = {
"megatron_cfg": {
"activation_checkpointing": False,
"apply_rope_fusion": False,
"bias_activation_fusion": False,
"gradient_accumulation_fusion": False,
"fine_grained_activation_offloading": True,
"offload_modules": offload_modules,
}
}

_apply_performance_config(model_cfg, config)

assert model_cfg.fine_grained_activation_offloading is True
assert model_cfg.offload_modules == offload_modules

def test_fine_grained_activation_offloading_disabled_skips(self):
    """With the flag absent from megatron_cfg, no offload attribute is assigned."""
    from nemo_rl.models.megatron.setup import _apply_performance_config

    megatron_cfg = {
        "activation_checkpointing": False,
        "apply_rope_fusion": False,
        "bias_activation_fusion": False,
        "gradient_accumulation_fusion": False,
    }

    # A spec'd mock raises AttributeError for names outside the spec unless
    # they were explicitly assigned, so hasattr() below reveals whether the
    # function under test wrote the attribute.
    model_cfg = MagicMock(spec=["gated_linear_unit"])
    model_cfg.gated_linear_unit = True

    _apply_performance_config(model_cfg, {"megatron_cfg": megatron_cfg})

    for attr_name in ("fine_grained_activation_offloading", "offload_modules"):
        assert not hasattr(model_cfg, attr_name)

@pytest.mark.parametrize(
    "offload_modules",
    [
        pytest.param([], id="empty_list"),
        pytest.param(None, id="none"),
        pytest.param("mlp", id="string"),
        pytest.param(42, id="int"),
    ],
)
def test_fine_grained_activation_offloading_invalid_modules_raises(
    self, offload_modules
):
    """Enabling the feature with anything but a non-empty list raises ValueError."""
    from nemo_rl.models.megatron.setup import _apply_performance_config

    megatron_cfg = {
        "activation_checkpointing": False,
        "apply_rope_fusion": False,
        "bias_activation_fusion": False,
        "gradient_accumulation_fusion": False,
        "fine_grained_activation_offloading": True,
        "offload_modules": offload_modules,
    }
    config = {"megatron_cfg": megatron_cfg}

    model_cfg = MagicMock()
    model_cfg.gated_linear_unit = True

    # Each parametrized value (empty list, None, str, int) must be rejected
    # with the same error message.
    expected_error = "offload_modules must be a non-empty list"
    with pytest.raises(ValueError, match=expected_error):
        _apply_performance_config(model_cfg, config)

def test_fine_grained_activation_offloading_missing_modules_raises(self):
"""When enabled but offload_modules key is absent, defaults to None → raises."""
from nemo_rl.models.megatron.setup import _apply_performance_config

model_cfg = MagicMock()
model_cfg.gated_linear_unit = True
config = {

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: the docstring says "defaults to []" but .get("offload_modules") returns None when the key is absent, not []. The test is correct — None fails the isinstance(..., list) check — but the docstring is misleading.

Suggested change
config = {
"""When enabled but offload_modules key is absent, defaults to None → raises."""

"megatron_cfg": {
"activation_checkpointing": False,
"apply_rope_fusion": False,
"bias_activation_fusion": False,
"gradient_accumulation_fusion": False,
"fine_grained_activation_offloading": True,
}
}

with pytest.raises(
ValueError, match="offload_modules must be a non-empty list"
):
_apply_performance_config(model_cfg, config)

def test_recompute_granularity_full_explicit(self):
"""granularity='full' sets uniform method with 1 layer."""
from nemo_rl.models.megatron.setup import _apply_performance_config
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/reference_configs/distillation_math.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,8 @@ policy: &POLICY_BASE
force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
empty_unused_memory_level: 0
activation_checkpointing: false
fine_grained_activation_offloading: false
offload_modules: null
recompute_granularity: "full"
recompute_modules: null
converter_type: "Qwen3ForCausalLM"
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/reference_configs/dpo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ policy:
force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
empty_unused_memory_level: 1
activation_checkpointing: false
fine_grained_activation_offloading: false
offload_modules: null
recompute_granularity: "full"
recompute_modules: null
tensor_model_parallel_size: 2
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/reference_configs/grpo_math_1B.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ policy:
force_reconvert_from_hf: False # Set to True to force reconvert of the model from Hugging Face
empty_unused_memory_level: 1 # 1 is the minimum recommendation for RL since we almost always need to offload before beginning generation. Setting to 0 is faster, but you are more likely to run out of GPU memory.
activation_checkpointing: false
fine_grained_activation_offloading: false
offload_modules: null
recompute_granularity: "full"
recompute_modules: null
converter_type: "Qwen2ForCausalLM"
Expand Down
2 changes: 2 additions & 0 deletions tests/unit/reference_configs/sft.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,8 @@ policy:
env_vars: {}
empty_unused_memory_level: 1
activation_checkpointing: false
fine_grained_activation_offloading: false
offload_modules: null
recompute_granularity: "full"
recompute_modules: null
tensor_model_parallel_size: 1
Expand Down
Loading