Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ grpo:
seq_logprob_error_threshold: null

async_grpo:
enabled: false # Set to true to enable async training mode
enabled: true # Set to true to enable async training mode
# Max age (in training steps) for trajectories used in training
max_trajectory_age_steps: 1
in_flight_weight_updates: false # Set to true to enable in-flight weight updates
max_trajectory_age_steps: 2
in_flight_weight_updates: true # Set to true to enable in-flight weight updates
recompute_kv_cache_after_weight_updates: false # Set to true to recompute kv cache after in-flight-weight-updates

loss_fn:
Expand All @@ -55,7 +55,7 @@ loss_fn:
# (default off) loss formulation improvements (docs/guides/grpo.md#loss)
use_on_policy_kl_approximation: false
truncated_importance_sampling_ratio: null
use_importance_sampling_correction: false
use_importance_sampling_correction: true
token_level_loss: true

checkpointing:
Expand Down Expand Up @@ -234,15 +234,14 @@ policy:
# Workplace assistant uses 26 tools, so we enable auto_tools.
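# The tool parser must match the model being served: `nemotron_json` is the dedicated parser for Nemotron Nano v2; other model families need their own parser (e.g. `hermes`).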
enable_auto_tools: true
tool_parser: nemotron_json
tool_parser: hermes
reasoning_parser: qwen3
vllm_kwargs:
compilation_config:
# When enforce_eager is False, set ++policy.generation.vllm_kwargs.compilation_config.backend=eager for better accuracy.
# With this setting, vLLM uses the custom CUDA kernels instead of the Triton kernels generated by torch.compile.
# For more details, see the convergence issue: https://github.com/NVIDIA-NeMo/RL/issues/998
backend: eager
# We need the Mamba cache to be set to fp32 for Nemotron Nano v2
mamba_ssm_cache_dtype: "float32"
colocated:
# true: generation shares training GPUs
# false: uses dedicated generation resources
Expand Down Expand Up @@ -297,10 +296,10 @@ env:
responses_api_models:
vllm_model:
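# Reasoning toggle: keep `uses_reasoning_parser` and `enable_thinking` in sync (both false disables reasoning, both true enables it).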
uses_reasoning_parser: false
uses_reasoning_parser: true
extra_body:
chat_template_kwargs:
enable_thinking: false
enable_thinking: true
code_gen:
resources_servers:
code_gen:
Expand Down Expand Up @@ -328,3 +327,4 @@ logger:
cluster:
gpus_per_node: 8
num_nodes: 1 # Single node by default; set to 2+ for multi-node training

2 changes: 1 addition & 1 deletion examples/nemo_gym/run_grpo_nemo_gym.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def collect_trajectories(
input_batch=val_batch,
tokenizer=tokenizer,
task_to_env=val_task_to_env,
max_seq_len=None,
max_seq_len=master_config["policy"]["max_total_sequence_length"],
generation_config=generation_config,
max_rollout_turns=None,
greedy=False,
Expand Down
Loading
Loading