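more fix: enable async GRPO in the workplace-assistant example, cap rollout max_seq_len, skip refit for the vllm backend, make Dynamo parsers configurable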

jthomson04 · jthomson04 · commit 877bb67d7d81 · 2026-04-21T15:25:26.000-07:00
Signed-off-by: jthomson04 <jwillthomson19@gmail.com>
diff --git a/3rdparty/Gym-workspace/Gym b/3rdparty/Gym-workspace/Gym
@@ -1 +1 @@
-Subproject commit 929c85e766d66a55df3fd08e98781fd808edbef8
+Subproject commit 8c08ffaebe75fa19633d68fd4003e957f2840251
diff --git a/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml b/examples/nemo_gym/grpo_workplace_assistant_nemotron_nano_v2_9b.yaml
@@ -38,10 +38,10 @@ grpo:
   seq_logprob_error_threshold: null
 
   async_grpo:
-    enabled: false # Set to true to enable async training mode
+    enabled: true # Set to true to enable async training mode
     # Max age (in training steps) for trajectories used in training
-    max_trajectory_age_steps: 1
-    in_flight_weight_updates: false # Set to true to enable in-flight weight updates
+    max_trajectory_age_steps: 2
+    in_flight_weight_updates: true # Set to true to enable in-flight weight updates
     recompute_kv_cache_after_weight_updates: false # Set to true to recompute kv cache after in-flight-weight-updates
 
 loss_fn:
@@ -55,7 +55,7 @@ loss_fn:
   # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
   use_on_policy_kl_approximation: false
   truncated_importance_sampling_ratio: null
-  use_importance_sampling_correction: false
+  use_importance_sampling_correction: true
   token_level_loss: true
 
 checkpointing:
@@ -234,15 +234,14 @@ policy:
         # Workplace assistant uses 26 tools, so we enable auto_tools.
         # For Nemotron Nano v2, we use the dedicated `nemotron_json` tool parser
         enable_auto_tools: true
-        tool_parser: nemotron_json
+        tool_parser: hermes
+        reasoning_parser: qwen3
     vllm_kwargs:
       compilation_config:
         # when enforce_eager is False, set ++policy.generation.vllm_kwargs.compilation_config.backend=eager for better accuracy,
         # with the flag, vllm will use the custom CUDA kernels instead of the Triton kernels generated by torch.compile
         # for more details, see convergence issue https://github.com/NVIDIA-NeMo/RL/issues/998
         backend: eager
-      # We need the Mamba cache to be set to fp32 for Nemotron Nano v2
-      mamba_ssm_cache_dtype: "float32"
     colocated:
       # true: generation shares training GPUs
       # false: uses dedicated generation resources
@@ -297,10 +296,10 @@ env:
       responses_api_models:
         vllm_model:
           # Disable reasoning!
-          uses_reasoning_parser: false
+          uses_reasoning_parser: true
           extra_body:
             chat_template_kwargs:
-              enable_thinking: false
+              enable_thinking: true
     code_gen:
       resources_servers:
         code_gen:
@@ -328,3 +327,4 @@ logger:
 cluster:
   gpus_per_node: 8
   num_nodes: 1  # Single node by default; set to 2+ for multi-node training
+
diff --git a/examples/nemo_gym/run_grpo_nemo_gym.py b/examples/nemo_gym/run_grpo_nemo_gym.py
@@ -94,7 +94,7 @@ def collect_trajectories(
             input_batch=val_batch,
             tokenizer=tokenizer,
             task_to_env=val_task_to_env,
-            max_seq_len=None,
+            max_seq_len=master_config["policy"]["max_total_sequence_length"],
             generation_config=generation_config,
             max_rollout_turns=None,
             greedy=False,
diff --git a/nemo_rl/algorithms/async_utils.py b/nemo_rl/algorithms/async_utils.py
@@ -657,7 +657,9 @@ def _run_prompt_group_worker(
                     input_batch=repeated_batch,
                     tokenizer=self.tokenizer,
                     task_to_env=self.task_to_env,
-                    max_seq_len=None,
+                    max_seq_len=self.master_config["policy"][
+                        "max_total_sequence_length"
+                    ],
                     generation_config=generation_config,
                     max_rollout_turns=None,
                     greedy=False,
diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py
@@ -779,7 +779,7 @@ def init_dynamo():
 
     # if it is not colocated inference, initialize collective communication for update weights
     # Dynamo backend does not support weight updates — skip collective init and refit.
-    if not colocated_inference and backend != "dynamo":
+    if not colocated_inference and backend not in ("dynamo", "vllm"):
         t0 = time.perf_counter()
         ip, port = train_cluster.get_master_address_and_port()
         print(f"Using ip: {ip}, port: {port} for collective communication", flush=True)
@@ -800,7 +800,7 @@ def init_dynamo():
 
     # prepare refit info
     state_dict_info = policy.prepare_refit_info()
-    if policy_generation is not None and backend != "dynamo":
+    if policy_generation is not None and backend not in ("dynamo", "vllm"):
         policy_generation.prepare_refit_info(state_dict_info)
 
     # Calculate total setup time
@@ -1393,7 +1393,7 @@ def grpo_train(
     if policy_generation is None:
         policy_generation = policy  # type: ignore
         NEED_REFIT = False
-    elif master_config["policy"]["generation"]["backend"] == "dynamo":
+    elif master_config["policy"]["generation"]["backend"] in ("dynamo", "vllm"):
         NEED_REFIT = False
     POLICY_GENERATION_STALE = True  # tracks if generation needs a refit before running
     assert policy_generation is not None  # for mypy type check
@@ -1579,7 +1579,9 @@ def grpo_train(
                             input_batch=repeated_batch,
                             tokenizer=tokenizer,
                             task_to_env=task_to_env,
-                            max_seq_len=None,
+                            max_seq_len=master_config["policy"][
+                                "max_total_sequence_length"
+                            ],
                             generation_config=generation_config,
                             max_rollout_turns=None,
                             greedy=False,
@@ -2316,7 +2318,7 @@ def validate(
                     input_batch=val_batch,
                     tokenizer=tokenizer,
                     task_to_env=val_task_to_env,
-                    max_seq_len=None,
+                    max_seq_len=master_config["policy"]["max_total_sequence_length"],
                     generation_config=generation_config,
                     max_rollout_turns=None,
                     greedy=False,
@@ -2489,7 +2491,7 @@ def async_grpo_train(
     if policy_generation is None:
         policy_generation = policy
         NEED_REFIT = False
-    elif master_config["policy"]["generation"]["backend"] == "dynamo":
+    elif master_config["policy"]["generation"]["backend"] in ("dynamo", "vllm"):
         NEED_REFIT = False
     POLICY_GENERATION_STALE = True
     assert policy_generation is not None
@@ -2950,6 +2952,11 @@ def async_grpo_train(
                         weight_version += 1
                         trajectory_collector.set_weight_version.remote(weight_version)
                         trajectory_collector.resume_after_refit.remote()
+                else:
+                    # Advance the trajectory collector's weight version even when refit is skipped
+                    # so that the replay buffer can sample trajectories targeted for subsequent steps.
+                    weight_version += 1
+                    trajectory_collector.set_weight_version.remote(weight_version)
 
                 # Clear logger metrics after each refit (weight sync), starting a new logging cycle
                 if policy_generation is not None:
diff --git a/nemo_rl/distributed/model_utils.py b/nemo_rl/distributed/model_utils.py
@@ -2151,13 +2151,18 @@ def forward(
     # calculate the logprobs for the last token and then return the logprobs
     vocab_start_index = tp_rank * (self.vocab_size // tp_size)
     vocab_end_index = min((tp_rank + 1) * (self.vocab_size // tp_size), self.vocab_size)
-    output_weight_layer = self.output_layer.weight
+    # For models with tied embeddings (e.g. Qwen3), self.output_layer.weight is None —
+    # the real weight lives on the embedding and must be fetched via
+    # shared_embedding_or_output_weight().
+    output_weight_layer = (
+        self.shared_embedding_or_output_weight()
+        if self.share_embeddings_and_output_weights
+        else self.output_layer.weight
+    )
     logprobs = from_parallel_hidden_states_to_logprobs(
         hidden_states,  # .transpose(0, 1).contiguous(),
         output_weight_layer,
-        self.shared_embedding_or_output_weight()
-        if self.share_embeddings_and_output_weights
-        else self.output_layer.weight,
+        output_weight_layer,
         runtime_gather_output,
         labels,
         vocab_start_index=vocab_start_index,
diff --git a/nemo_rl/models/generation/__init__.py b/nemo_rl/models/generation/__init__.py
@@ -44,8 +44,10 @@ def configure_generation_config(
     # vllm setting
     if config["backend"] == "vllm":
         config = cast(VllmConfig, config)
-        # set load_format
-        config["vllm_cfg"]["load_format"] = "auto" if is_eval else "dummy"
+        # set load_format (respect user override if they set it explicitly,
+        # e.g. to force "auto" for benchmarking without a Megatron refit).
+        if config["vllm_cfg"].get("load_format") is None:
+            config["vllm_cfg"]["load_format"] = "auto" if is_eval else "dummy"
         speculative_config = config.get("vllm_kwargs", {}).get("speculative_config")
         if speculative_config:
             # Speculative decoding needs real startup weights unless the draft
diff --git a/nemo_rl/models/generation/dynamo/config.py b/nemo_rl/models/generation/dynamo/config.py
@@ -44,6 +44,8 @@ class DynamoCfg(TypedDict, total=False):
     namespace: str
     enable_planner: bool  # Launch planner + VirtualConnectorClient for autoscaling
     initial_dp_size: int  # Workers at startup (must be <= cluster.world_size() // tp_size)
+    tool_call_parser: str  # Dynamo parser name, or "none" to disable
+    reasoning_parser: str  # Dynamo parser name, or "none" to disable
 
 
 class DynamoVllmConfig(GenerationConfig):
diff --git a/nemo_rl/models/generation/dynamo/dynamo_generation.py b/nemo_rl/models/generation/dynamo/dynamo_generation.py
@@ -326,7 +326,7 @@ def _start_planner(self) -> None:
             "pre_deployment_sweeping_mode": "none",
             "decode_engine_num_gpu": self.tp_size,
             "ttft": 500.0,
-            "itl": 50.0,
+            "itl": 1.0,
             "max_gpu_budget": self._inference_gpu_count,
             "min_endpoint": 1,
             "load_adjustment_interval": 5,
@@ -366,7 +366,8 @@ def _start_frontend(self) -> None:
                 "--router-mode", "kv",
                 "--active-decode-blocks-threshold", "1000.0",
                 "--active-prefill-tokens-threshold", "1000000000000",
-                "--active-prefill-tokens-threshold-frac", "1000.0"
+                "--active-prefill-tokens-threshold-frac", "1000.0",
+                "--router-predict-on-route"
             ],
             env=env,
         )
@@ -543,6 +544,24 @@ def shutdown(self) -> bool:
         self._pool.shutdown()
         return True
 
+    # ------------------------------------------------------------------
+    # Serialization support (for Ray actor pickling in async GRPO)
+    # ------------------------------------------------------------------
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        for attr, _, _ in _SUBPROCESS_REGISTRY:
+            state[attr] = None
+        state["_vc_stop"] = None
+        state["_vc_thread"] = None
+        state["_pool"] = None
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self._external = True
+        self._vc_stop = threading.Event()
+
     # ------------------------------------------------------------------
     # Unsupported weight-update methods
     # ------------------------------------------------------------------
diff --git a/nemo_rl/models/generation/dynamo/dynamo_worker.py b/nemo_rl/models/generation/dynamo/dynamo_worker.py
@@ -34,6 +34,16 @@
 from nemo_rl.models.generation.dynamo.config import DynamoVllmConfig
 
 
+def _normalize_parser_name(value: Optional[str], default: Optional[str]) -> Optional[str]:
+    """Normalize parser names from config/env, treating empty/none as disabled."""
+    if value is None:
+        return default
+    normalized = value.strip()
+    if not normalized or normalized.lower() == "none":
+        return None
+    return normalized
+
+
 def _build_vllm_cli_args(
     model_name: str,
     vllm_cfg: dict[str, Any],
@@ -276,10 +286,22 @@ def __init__(
                 kv_events_config_json=kv_events_json,
                 seed=seed,
             ),
-            "--dyn-tool-call-parser", "hermes",
-            "--dyn-reasoning-parser", "qwen3"
         ]
 
+        dynamo_cfg = config.get("dynamo_cfg", {})
+        tool_call_parser = _normalize_parser_name(
+            os.environ.get("DYNAMO_TOOL_CALL_PARSER"),
+            _normalize_parser_name(dynamo_cfg.get("tool_call_parser"), "hermes"),
+        )
+        reasoning_parser = _normalize_parser_name(
+            os.environ.get("DYNAMO_REASONING_PARSER"),
+            _normalize_parser_name(dynamo_cfg.get("reasoning_parser"), "qwen3"),
+        )
+        if tool_call_parser is not None:
+            cmd.extend(["--dyn-tool-call-parser", tool_call_parser])
+        if reasoning_parser is not None:
+            cmd.extend(["--dyn-reasoning-parser", reasoning_parser])
+
         # --- Subprocess environment ---
         env = os.environ.copy()
         env["CUDA_VISIBLE_DEVICES"] = cuda_visible
@@ -315,7 +337,9 @@ def __init__(
         print(
             f"  [DynamoVllmWorker] Launched dynamo.vllm (pid={self._process.pid}, "
             f"CUDA_VISIBLE_DEVICES={cuda_visible}, "
-            f"TP={tp_size})",
+            f"TP={tp_size}, "
+            f"tool_call_parser={tool_call_parser or 'disabled'}, "
+            f"reasoning_parser={reasoning_parser or 'disabled'})",
             flush=True,
         )