diff --git a/docs/design-docs/nemo-gym-integration.md b/docs/design-docs/nemo-gym-integration.md
index 33e324547b..c83ae276d3 100644
--- a/docs/design-docs/nemo-gym-integration.md
+++ b/docs/design-docs/nemo-gym-integration.md
@@ -1,6 +1,6 @@
 # NeMo Gym Integration
 
-This document describes how NeMo RL integrates with [NeMo Gym](https://docs.nvidia.com/nemo/gym/latest/index.html) for multi-step and multi-turn reinforcement learning training.
+This document describes how NeMo RL integrates with [NeMo Gym](https://docs.nvidia.com/nemo/gym/v0.2.1/index.html) for multi-step and multi-turn reinforcement learning training.
 
 ## Overview
 
@@ -181,7 +181,7 @@ sequenceDiagram
     GRPO->>Policy: Compute loss and train
 ```
 
-> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/latest/about/concepts/core-components.html)):
+> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/v0.2.1/about/concepts/core-components/)):
 > - **Agent Server**: Orchestrates the rollout loop
 > - **Model Server**: HTTP proxy to vLLM; translates Responses API ↔ Chat Completions
 > - **Resource Server**: Provides tools and rewards
@@ -254,4 +254,4 @@ Token IDs are extracted at the NeMo RL vLLM layer via the `/tokenize` endpoint.
 - Tokenization matches the exact model and tokenizer used for generation
 - No re-tokenization drift between generation and training
 
-For details on on-policy token ID handling, see {doc}`../guides/environments` and the [NeMo Gym on-policy corrections documentation](https://docs.nvidia.com/nemo/gym/latest/contribute/rl-framework-integration/openai-compatible-http-server-on-policy-correction.html).
+For details on on-policy token ID handling, see {doc}`../guides/environments` and the [NeMo Gym on-policy corrections documentation](https://docs.nvidia.com/nemo/gym/v0.2.1/contribute/rl-framework-integration/openai-compatible-http-server-on-policy-correction.html).
diff --git a/nemo_rl/algorithms/rm.py b/nemo_rl/algorithms/rm.py
index 80cfc51bc9..e11bc813ab 100644
--- a/nemo_rl/algorithms/rm.py
+++ b/nemo_rl/algorithms/rm.py
@@ -116,6 +116,21 @@ def setup(
 
     # Extract individual configs for easier access
     policy_config = master_config.policy
+
+    # TODO(https://github.com/NVIDIA-NeMo/RL/issues/2482): remove once CP is supported for RM training.
+    dtensor_cfg = policy_config.get("dtensor_cfg", {})
+    if (
+        dtensor_cfg.get("enabled", False)
+        and dtensor_cfg.get("context_parallel_size", 1) > 1
+    ):
+        raise ValueError(
+            "Context parallelism (context_parallel_size > 1) is not supported for reward model "
+            "training on the DTensor backend. The log_sigmoid operator used in the RM loss does "
+            "not have a DTensor sharding strategy registered for CP meshes. "
+            "Please set policy.dtensor_cfg.context_parallel_size=1. "
+            "See https://github.com/NVIDIA-NeMo/RL/issues/2482 for tracking."
+        )
+
     data_config = master_config.data
     rm_config = master_config.rm
     logger_config = master_config.logger
diff --git a/tests/unit/algorithms/test_rm.py b/tests/unit/algorithms/test_rm.py
index d00b31a1aa..dc922cc659 100644
--- a/tests/unit/algorithms/test_rm.py
+++ b/tests/unit/algorithms/test_rm.py
@@ -19,7 +19,7 @@
 from torchdata.stateful_dataloader import StatefulDataLoader
 
 from nemo_rl.algorithms.loss import PreferenceLossFn
-from nemo_rl.algorithms.rm import MasterConfig, _default_rm_save_state, rm_train
+from nemo_rl.algorithms.rm import MasterConfig, _default_rm_save_state, rm_train, setup
 
 
 @pytest.fixture
@@ -125,6 +125,69 @@ def val_iter(self):
     }
 
 
+def test_context_parallel_rejected_for_dtensor_rm():
+    """Test that context_parallel_size > 1 raises ValueError for DTensor RM training.
+
+    TODO(https://github.com/NVIDIA-NeMo/RL/issues/2482): remove when CP is supported for RM.
+    """
+    config = MasterConfig.model_construct(
+        **{
+            "policy": {
+                "dtensor_cfg": {
+                    "enabled": True,
+                    "context_parallel_size": 2,
+                    "tensor_parallel_size": 1,
+                    "sequence_parallel": False,
+                    "activation_checkpointing": False,
+                    "cpu_offload": False,
+                },
+            },
+            "rm": {"seed": 42},
+            "data": {},
+            "logger": {},
+            "cluster": {},
+            "checkpointing": {},
+        }
+    )
+    with pytest.raises(
+        ValueError,
+        match="Context parallelism.*is not supported for reward model training",
+    ):
+        setup(config, MagicMock(), MagicMock(), {})
+
+
+def test_context_parallel_allowed_when_one():
+    """Test that context_parallel_size=1 does not raise for DTensor RM training.
+
+    We verify the CP check passes by confirming the error comes from a later
+    setup stage, not from our validation.
+
+    TODO(https://github.com/NVIDIA-NeMo/RL/issues/2482): remove when CP is supported for RM.
+    """
+    config = MasterConfig.model_construct(
+        **{
+            "policy": {
+                "dtensor_cfg": {
+                    "enabled": True,
+                    "context_parallel_size": 1,
+                    "tensor_parallel_size": 1,
+                    "sequence_parallel": False,
+                    "activation_checkpointing": False,
+                    "cpu_offload": False,
+                },
+            },
+            "rm": {"seed": 42},
+            "data": {},
+            "logger": {},
+            "cluster": {},
+            "checkpointing": {},
+        }
+    )
+    with pytest.raises(Exception) as excinfo:
+        setup(config, MagicMock(), MagicMock(), {})
+    assert "Context parallelism" not in str(excinfo.value)
+
+
 def test_exit_on_max_steps(mock_components):
     """Test that training loop exits when max_num_steps is reached"""
     # Set max steps to 12, which is less than len(train_dataloader) * max_num_epochs
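
For reviewers who want to exercise the new validation without a full training setup, here is a minimal standalone sketch of the guard this patch adds to `nemo_rl.algorithms.rm.setup`. The `policy_config` dict below is a hypothetical stand-in for the real `MasterConfig.policy` section; only the guard logic itself is taken from the patch:

```python
# Sketch only: `policy_config` is a plain dict standing in for MasterConfig.policy.
policy_config = {
    "dtensor_cfg": {"enabled": True, "context_parallel_size": 2},
}

# Same check as the guard added in nemo_rl/algorithms/rm.py above: reject
# CP > 1 on the DTensor backend, since the RM loss's log_sigmoid operator
# has no DTensor sharding strategy for CP meshes.
dtensor_cfg = policy_config.get("dtensor_cfg", {})
if (
    dtensor_cfg.get("enabled", False)
    and dtensor_cfg.get("context_parallel_size", 1) > 1
):
    raise ValueError(
        "Context parallelism (context_parallel_size > 1) is not supported "
        "for reward model training on the DTensor backend."
    )
```

Assuming a standard pytest invocation from the repository root, both new tests can be run with `pytest tests/unit/algorithms/test_rm.py -k context_parallel`.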