From d3187097558f536f02cf5d80a6cd6c698236005a Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Wed, 13 May 2026 00:23:15 -0700 Subject: [PATCH 1/3] fix(rm): raise clear error when context parallelism is used with DTensor RM training Context parallelism (context_parallel_size > 1) is not supported for reward model training on the DTensor backend because the log_sigmoid operator lacks a DTensor sharding strategy for CP meshes. Instead of letting users hit cryptic runtime errors, raise a clear ValueError during setup with a link to the tracking issue. Signed-off-by: Terry Kong --- nemo_rl/algorithms/rm.py | 15 ++++++++ tests/unit/algorithms/test_rm.py | 65 +++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 1 deletion(-) diff --git a/nemo_rl/algorithms/rm.py b/nemo_rl/algorithms/rm.py index 80cfc51bc9..e11bc813ab 100644 --- a/nemo_rl/algorithms/rm.py +++ b/nemo_rl/algorithms/rm.py @@ -116,6 +116,21 @@ def setup( # Extract individual configs for easier access policy_config = master_config.policy + + # TODO(https://github.com/NVIDIA-NeMo/RL/issues/2482): remove once CP is supported for RM training. + dtensor_cfg = policy_config.get("dtensor_cfg", {}) + if ( + dtensor_cfg.get("enabled", False) + and dtensor_cfg.get("context_parallel_size", 1) > 1 + ): + raise ValueError( + "Context parallelism (context_parallel_size > 1) is not supported for reward model " + "training on the DTensor backend. The log_sigmoid operator used in the RM loss does " + "not have a DTensor sharding strategy registered for CP meshes. " + "Please set policy.dtensor_cfg.context_parallel_size=1. " + "See https://github.com/NVIDIA-NeMo/RL/issues/2482 for tracking." + ) + data_config = master_config.data rm_config = master_config.rm logger_config = master_config.logger diff --git a/tests/unit/algorithms/test_rm.py b/tests/unit/algorithms/test_rm.py index d00b31a1aa..dc922cc659 100644 --- a/tests/unit/algorithms/test_rm.py +++ b/tests/unit/algorithms/test_rm.py @@ -19,7 +19,7 @@ from torchdata.stateful_dataloader import StatefulDataLoader from nemo_rl.algorithms.loss import PreferenceLossFn -from nemo_rl.algorithms.rm import MasterConfig, _default_rm_save_state, rm_train +from nemo_rl.algorithms.rm import MasterConfig, _default_rm_save_state, rm_train, setup @pytest.fixture @@ -125,6 +125,69 @@ def val_iter(self): } +def test_context_parallel_rejected_for_dtensor_rm(): + """Test that context_parallel_size > 1 raises ValueError for DTensor RM training. + + TODO(https://github.com/NVIDIA-NeMo/RL/issues/2482): remove when CP is supported for RM. + """ + config = MasterConfig.model_construct( + **{ + "policy": { + "dtensor_cfg": { + "enabled": True, + "context_parallel_size": 2, + "tensor_parallel_size": 1, + "sequence_parallel": False, + "activation_checkpointing": False, + "cpu_offload": False, + }, + }, + "rm": {"seed": 42}, + "data": {}, + "logger": {}, + "cluster": {}, + "checkpointing": {}, + } + ) + with pytest.raises( + ValueError, + match="Context parallelism.*is not supported for reward model training", + ): + setup(config, MagicMock(), MagicMock(), {}) + + +def test_context_parallel_allowed_when_one(): + """Test that context_parallel_size=1 does not raise for DTensor RM training. + + We verify the CP check passes by confirming the error comes from a later + setup stage, not from our validation. + + TODO(https://github.com/NVIDIA-NeMo/RL/issues/2482): remove when CP is supported for RM. + """ + config = MasterConfig.model_construct( + **{ + "policy": { + "dtensor_cfg": { + "enabled": True, + "context_parallel_size": 1, + "tensor_parallel_size": 1, + "sequence_parallel": False, + "activation_checkpointing": False, + "cpu_offload": False, + }, + }, + "rm": {"seed": 42}, + "data": {}, + "logger": {}, + "cluster": {}, + "checkpointing": {}, + } + ) + with pytest.raises(Exception) as excinfo: + setup(config, MagicMock(), MagicMock(), {}) + assert "Context parallelism" not in str(excinfo.value) + + def test_exit_on_max_steps(mock_components): """Test that training loop exits when max_num_steps is reached""" # Set max steps to 12, which is less than len(train_dataloader) * max_num_epochs From 9c93d8d507a4ea46cd747d8a8ad4571a524bf236 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Thu, 14 May 2026 23:23:36 -0700 Subject: [PATCH 2/3] ci: ignore NeMo Gym docs in sphinx linkcheck The NeMo Gym docs URL returns 404, causing sphinx-build CI to fail. Add the URL pattern to linkcheck_ignore since the external docs site is not under our control. Signed-off-by: Terry Kong --- docs/conf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/conf.py b/docs/conf.py index 99a3d3f5ae..2995d7d26e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -117,6 +117,7 @@ linkcheck_ignore = [ ".*github\\.com.*", ".*githubusercontent\\.com.*", + "https://docs\\.nvidia\\.com/nemo/gym/.*", ] # PyTorch docs anchor IDs change between stable versions; verify the page From 4c24d9eaab1cf4df86295ae4bbf05c5aab9cf2e0 Mon Sep 17 00:00:00 2001 From: Terry Kong Date: Fri, 15 May 2026 00:26:17 -0700 Subject: [PATCH 3/3] fix(docs): pin gym doc links to v0.2.1 instead of blanket-ignoring Replace the blanket linkcheck_ignore for all NeMo Gym docs with pinned v0.2.1 URLs so linkcheck still validates them. Signed-off-by: Terry Kong --- docs/conf.py | 1 - docs/design-docs/nemo-gym-integration.md | 6 +++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 2995d7d26e..99a3d3f5ae 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -117,7 +117,6 @@ linkcheck_ignore = [ ".*github\\.com.*", ".*githubusercontent\\.com.*", - "https://docs\\.nvidia\\.com/nemo/gym/.*", ] # PyTorch docs anchor IDs change between stable versions; verify the page diff --git a/docs/design-docs/nemo-gym-integration.md b/docs/design-docs/nemo-gym-integration.md index 33e324547b..c83ae276d3 100644 --- a/docs/design-docs/nemo-gym-integration.md +++ b/docs/design-docs/nemo-gym-integration.md @@ -1,6 +1,6 @@ # NeMo Gym Integration -This document describes how NeMo RL integrates with [NeMo Gym](https://docs.nvidia.com/nemo/gym/latest/index.html) for multi-step and multi-turn reinforcement learning training. +This document describes how NeMo RL integrates with [NeMo Gym](https://docs.nvidia.com/nemo/gym/v0.2.1/index.html) for multi-step and multi-turn reinforcement learning training. ## Overview @@ -181,7 +181,7 @@ sequenceDiagram GRPO->>Policy: Compute loss and train ``` -> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/latest/about/concepts/core-components.html)): +> **NeMo Gym server types** (see [Core Components](https://docs.nvidia.com/nemo/gym/v0.2.1/about/concepts/core-components/)): > - **Agent Server**: Orchestrates the rollout loop > - **Model Server**: HTTP proxy to vLLM; translates Responses API ↔ Chat Completions > - **Resource Server**: Provides tools and rewards @@ -254,4 +254,4 @@ Token IDs are extracted at the NeMo RL vLLM layer via the `/tokenize` endpoint. - Tokenization matches the exact model and tokenizer used for generation - No re-tokenization drift between generation and training -For details on on-policy token ID handling, see {doc}`../guides/environments` and the [NeMo Gym on-policy corrections documentation](https://docs.nvidia.com/nemo/gym/latest/contribute/rl-framework-integration/openai-compatible-http-server-on-policy-correction.html). +For details on on-policy token ID handling, see {doc}`../guides/environments` and the [NeMo Gym on-policy corrections documentation](https://docs.nvidia.com/nemo/gym/v0.2.1/contribute/rl-framework-integration/openai-compatible-http-server-on-policy-correction.html).