NVIDIA-NeMo · jthomson04 · Apr 6, 2026 · Apr 7, 2026 · Apr 7, 2026 · Apr 7, 2026
@@ -38,10 +38,10 @@ grpo:
   seq_logprob_error_threshold: null
 
   async_grpo:
-    enabled: false # Set to true to enable async training mode
+    enabled: true # Set to true to enable async training mode
     # Max age (in training steps) for trajectories used in training
-    max_trajectory_age_steps: 1
-    in_flight_weight_updates: false # Set to true to enable in-flight weight updates
+    max_trajectory_age_steps: 2
+    in_flight_weight_updates: true # Set to true to enable in-flight weight updates
     recompute_kv_cache_after_weight_updates: false # Set to true to recompute kv cache after in-flight-weight-updates
 
 loss_fn:
@@ -55,7 +55,7 @@ loss_fn:
   # (default off) loss formulation improvements (docs/guides/grpo.md#loss)
   use_on_policy_kl_approximation: false
   truncated_importance_sampling_ratio: null
-  use_importance_sampling_correction: false
+  use_importance_sampling_correction: true
   token_level_loss: true
 
 checkpointing:
@@ -234,15 +234,14 @@ policy:
         # Workplace assistant uses 26 tools, so we enable auto_tools.
         # This config uses the `hermes` tool parser; the dedicated `nemotron_json` tool parser is only for Nemotron Nano v2.
         enable_auto_tools: true
-        tool_parser: nemotron_json
+        tool_parser: hermes
+        reasoning_parser: qwen3
     vllm_kwargs:
       compilation_config:
         # When enforce_eager is False, set ++policy.generation.vllm_kwargs.compilation_config.backend=eager for better accuracy.
         # With this flag, vLLM uses the custom CUDA kernels instead of the Triton kernels generated by torch.compile.
         # For more details, see the convergence issue: https://github.com/NVIDIA-NeMo/RL/issues/998
         backend: eager
-      # We need the Mamba cache to be set to fp32 for Nemotron Nano v2
-      mamba_ssm_cache_dtype: "float32"
     colocated:
       # true: generation shares training GPUs
       # false: uses dedicated generation resources
@@ -297,10 +296,10 @@ env:
       responses_api_models:
         vllm_model:
           # Enable reasoning: the reasoning parser and thinking mode are both switched on.
-          uses_reasoning_parser: false
+          uses_reasoning_parser: true
           extra_body:
             chat_template_kwargs:
-              enable_thinking: false
+              enable_thinking: true
     code_gen:
       resources_servers:
         code_gen:
@@ -328,3 +327,4 @@ logger:
 cluster:
   gpus_per_node: 8
   num_nodes: 1  # Single node by default; set to 2+ for multi-node training
+
@@ -94,7 +94,7 @@ def collect_trajectories(
             input_batch=val_batch,
             tokenizer=tokenizer,
             task_to_env=val_task_to_env,
-            max_seq_len=None,
+            max_seq_len=master_config["policy"]["max_total_sequence_length"],
             generation_config=generation_config,
             max_rollout_turns=None,
             greedy=False,
+1 −1		nemo_gym/global_config.py
+22 −1		nemo_gym/openai_utils.py
+16 −1		responses_api_agents/simple_agent/app.py
+286 −6		responses_api_models/vllm_model/app.py
+37 −0		responses_api_models/vllm_model/tests/test_app.py