NVIDIA-NeMo · zpqiu · May 21, 2026 · May 22, 2026
@@ -6,6 +6,7 @@ grpo:
 loss_fn:
   reference_policy_kl_penalty: 0.0
   use_importance_sampling_correction: true
+  truncated_importance_sampling_type: tis
   truncated_importance_sampling_ratio: 2
 checkpointing:
   checkpoint_dir: results/grpo-glm47-flash-4n8g-automodel

@@ -11,6 +11,7 @@ grpo:
 loss_fn:
   reference_policy_kl_penalty: 0.0
   use_importance_sampling_correction: true
+  truncated_importance_sampling_type: tis
   truncated_importance_sampling_ratio: 2
   ratio_clip_max: 0.28
   ratio_clip_c: 10

@@ -33,9 +33,11 @@ uv run tests/json_dump_tb_logs.py $LOG_DIR --output_path $JSON_METRICS
 
 # Only run metrics if the target step is reached
 if [[ $(jq 'to_entries | .[] | select(.key == "train/loss") | .value | keys | map(tonumber) | max' $JSON_METRICS) -ge $MAX_STEPS ]]; then
+    # The step-30 token_mult_prob_error check has high tail variance in this
+    # test; gen_kl_error below already measures policy/generation mismatch.
+    #   'data["train/token_mult_prob_error"]["30"] < 1.1'
     uv run tests/check_metrics.py $JSON_METRICS \
         'median(data["train/token_mult_prob_error"]) < 1.1' \
-        'data["train/token_mult_prob_error"]["30"] < 1.1' \
         'mean(data["train/gen_kl_error"]) < 0.01' \
         'data["train/reward"]["30"] > 0.3' \
         'max(data["validation/accuracy"]) > 0.2' \