-
Notifications
You must be signed in to change notification settings - Fork 452
perf: support fine-grained activation offloading #2279
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
feee53e
0b530c4
d5df80a
f742bf8
e237987
4f4681c
06b4d4a
2b33ad3
5225217
211e31a
502d2dd
da947f8
fcbd22b
254c7d0
625b25f
ed86517
d89ac84
aaece7b
bcccabf
08d121d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -735,6 +735,24 @@ def _apply_performance_config(model_cfg: Any, config: PolicyConfig) -> None: | |
| "Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue." | ||
| ) | ||
|
|
||
| # Megatron validates module names and per-model-type compatibility. | ||
| # Note: Megatron-Bridge's standalone training path also sets NUMA-aware | ||
| # CPU affinity via set_ideal_affinity_for_current_gpu() when this is on, | ||
| # which improves PCIe/DRAM throughput. NeMo-RL does not call it; users | ||
| # who need maximum offload bandwidth may want to set affinity externally. | ||
| fine_grained_activation_offloading = config["megatron_cfg"].get( | ||
| "fine_grained_activation_offloading" | ||
| ) | ||
| if fine_grained_activation_offloading: | ||
|
terrykong marked this conversation as resolved.
|
||
| offload_modules = config["megatron_cfg"].get("offload_modules") | ||
| if not isinstance(offload_modules, list) or not offload_modules: | ||
| raise ValueError( | ||
| "offload_modules must be a non-empty list when " | ||
| "fine_grained_activation_offloading is True." | ||
| ) | ||
| model_cfg.fine_grained_activation_offloading = True | ||
| model_cfg.offload_modules = offload_modules | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Megatron-Bridge calls
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @seonjinn wdyt of this? |
||
|
|
||
|
|
||
| def _validate_optimizer_config(config: PolicyConfig) -> None: | ||
| """Validate optimizer configuration.""" | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -1005,6 +1005,99 @@ def test_fp8_param_warning(self): | |||||
| with pytest.warns(UserWarning, match="fp8_param=True sometimes causes NaN"): | ||||||
| _apply_performance_config(model_cfg, config) | ||||||
|
|
||||||
| def test_fine_grained_activation_offloading_enabled(self): | ||||||
| """Test happy path: enabled with non-empty offload_modules list.""" | ||||||
| from nemo_rl.models.megatron.setup import _apply_performance_config | ||||||
|
|
||||||
| model_cfg = MagicMock() | ||||||
| model_cfg.gated_linear_unit = True | ||||||
| offload_modules = ["mlp", "moe_act"] | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit:
Suggested change
|
||||||
| config = { | ||||||
| "megatron_cfg": { | ||||||
| "activation_checkpointing": False, | ||||||
| "apply_rope_fusion": False, | ||||||
| "bias_activation_fusion": False, | ||||||
| "gradient_accumulation_fusion": False, | ||||||
| "fine_grained_activation_offloading": True, | ||||||
| "offload_modules": offload_modules, | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| _apply_performance_config(model_cfg, config) | ||||||
|
|
||||||
| assert model_cfg.fine_grained_activation_offloading is True | ||||||
| assert model_cfg.offload_modules == offload_modules | ||||||
|
|
||||||
| def test_fine_grained_activation_offloading_disabled_skips(self): | ||||||
| """When flag is False (default), no offload attrs should be set.""" | ||||||
| from nemo_rl.models.megatron.setup import _apply_performance_config | ||||||
|
|
||||||
| model_cfg = MagicMock(spec=["gated_linear_unit"]) | ||||||
| model_cfg.gated_linear_unit = True | ||||||
| config = { | ||||||
| "megatron_cfg": { | ||||||
| "activation_checkpointing": False, | ||||||
| "apply_rope_fusion": False, | ||||||
| "bias_activation_fusion": False, | ||||||
| "gradient_accumulation_fusion": False, | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| _apply_performance_config(model_cfg, config) | ||||||
|
|
||||||
| assert not hasattr(model_cfg, "fine_grained_activation_offloading") | ||||||
| assert not hasattr(model_cfg, "offload_modules") | ||||||
|
|
||||||
| @pytest.mark.parametrize( | ||||||
| "offload_modules", | ||||||
| [[], None, "mlp", 42], | ||||||
| ids=["empty_list", "none", "string", "int"], | ||||||
| ) | ||||||
| def test_fine_grained_activation_offloading_invalid_modules_raises( | ||||||
| self, offload_modules | ||||||
| ): | ||||||
| """offload_modules must be a non-empty list when feature is enabled.""" | ||||||
| from nemo_rl.models.megatron.setup import _apply_performance_config | ||||||
|
|
||||||
| model_cfg = MagicMock() | ||||||
| model_cfg.gated_linear_unit = True | ||||||
| config = { | ||||||
| "megatron_cfg": { | ||||||
| "activation_checkpointing": False, | ||||||
| "apply_rope_fusion": False, | ||||||
| "bias_activation_fusion": False, | ||||||
| "gradient_accumulation_fusion": False, | ||||||
| "fine_grained_activation_offloading": True, | ||||||
| "offload_modules": offload_modules, | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| with pytest.raises( | ||||||
| ValueError, match="offload_modules must be a non-empty list" | ||||||
| ): | ||||||
| _apply_performance_config(model_cfg, config) | ||||||
|
|
||||||
| def test_fine_grained_activation_offloading_missing_modules_raises(self): | ||||||
| """When enabled but offload_modules key is absent, defaults to None → raises.""" | ||||||
| from nemo_rl.models.megatron.setup import _apply_performance_config | ||||||
|
|
||||||
| model_cfg = MagicMock() | ||||||
| model_cfg.gated_linear_unit = True | ||||||
| config = { | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: the docstring says "defaults to []" but
Suggested change
|
||||||
| "megatron_cfg": { | ||||||
| "activation_checkpointing": False, | ||||||
| "apply_rope_fusion": False, | ||||||
| "bias_activation_fusion": False, | ||||||
| "gradient_accumulation_fusion": False, | ||||||
| "fine_grained_activation_offloading": True, | ||||||
| } | ||||||
| } | ||||||
|
|
||||||
| with pytest.raises( | ||||||
| ValueError, match="offload_modules must be a non-empty list" | ||||||
| ): | ||||||
| _apply_performance_config(model_cfg, config) | ||||||
|
|
||||||
| def test_recompute_granularity_full_explicit(self): | ||||||
| """granularity='full' sets uniform method with 1 layer.""" | ||||||
| from nemo_rl.models.megatron.setup import _apply_performance_config | ||||||
|
|
||||||
Uh oh!
There was an error while loading. Please reload this page.