fix(opd): read args.rollout_temperature directly (drop getattr fallback)

EazyReal · EazyReal · commit ca2ea7fb7843 · 2026-06-30T01:39:42.000-07:00
diff --git a/slime/rollout/on_policy_distillation.py b/slime/rollout/on_policy_distillation.py
@@ -10,11 +10,7 @@ async def reward_func(args, sample, **kwargs):
         # "text": sample.prompt + sample.response,
         "input_ids": sample.tokens,
         "sampling_params": {
-            # Score teacher log-probs at rollout_temperature: SGLang scales
-            # input_token_logprobs by the sampling temperature, and the student
-            # log-probs are temperature-scaled too (get_responses), so the OPD KL is
-            # only consistent when both are at the same temperature.
-            "temperature": getattr(args, "rollout_temperature", 1.0),
+            "temperature": args.rollout_temperature,
             "max_new_tokens": 0,
             "skip_special_tokens": False,
         },