[Feat] Add on-policy distillation support (#964)

HwVanICI · gemini-code-assist[bot] · root · commit 3fb99a03ce4f · 2026-03-12T02:45:21.000Z
* [feat] Add on-policy knowledge distillation support

* Correct file extension

* Delete comment on distill_loss_weight

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>

* explain RKL and Joint loss

* Add reference to global README

* teacher config refactoring

* refactor: remove redundant stats_tracker denominator logging for KD loss

---------

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
diff --git a/README.md b/README.md
@@ -178,6 +178,7 @@ All RL algorithms support both asynchronous and synchronous versions by setting
 | **M2PO**                 | [📖 Docs](docs/algorithms/m2po.md)        | [📄 Paper](https://arxiv.org/abs/2510.01161)   | [🔗 GSM8K Example](examples/math/gsm8k_m2po.yaml)            |
 | **RLHF Reward Modeling** | -                                         | -                                              | [🔗 RLHF Example](examples/alignment/)                       |
 | **SFT**                  | -                                         | -                                              | [🔗 GSM8K Example](examples/math/gsm8k_sft.py)               |
+| **Distillation**                  | [📖 Docs](docs/en/algorithms/distillation.md)                                         |  [📄 Paper](https://arxiv.org/pdf/2506.02208)                                              | [🔗 GSM8K Example](examples/distillation/gsm8k_grpo_distill.yaml)               |
 
 ### Models
 
diff --git a/areal/api/cli_args.py b/areal/api/cli_args.py
@@ -2145,6 +2145,23 @@ class RWConfig(BaseExperimentConfig):
     actor: TrainEngineConfig = field(default_factory=TrainEngineConfig)
 
 
+@dataclass
+class TeacherConfig(PPOActorConfig):
+    """Configuration for the teacher model used in on-policy distillation.
+
+    Extends ``PPOActorConfig`` with the teacher's own allocation mode and the
+    two weights that combine the RL loss with the distillation (reverse-KL)
+    loss.
+    """
+    # Parsed with ``AllocationMode.from_str`` when the trainer builds and
+    # initializes the teacher engine.
+    # NOTE(review): the default is an empty string; whether ``from_str``
+    # accepts it is not visible here, so set this explicitly.
+    allocation_mode: str = field(
+        default="",
+        metadata={"help": "Pattern-based GPU parallel strategy allocation mode. "},
+    )
+    # Multiplier on the RL loss in the joint objective. A value of 0 makes
+    # ``grpo_loss_fn`` take its pure-distillation branch instead.
+    rl_loss_weight: float = field(
+        default=1.0,
+        metadata={"help": "RL loss weight"},
+    )
+
+    # Multiplier on the reverse-KL distillation term.
+    distill_loss_weight: float = field(
+        default=0.005,
+        metadata={"help": "Distillation loss weight"},
+    )
+
+
 @dataclass
 class PPOConfig(BaseExperimentConfig):
     """Configuration for Proximal Policy Optimization (PPO) reinforcement learning experiments."""
@@ -2162,6 +2179,17 @@ class PPOConfig(BaseExperimentConfig):
     actor: PPOActorConfig = field(default_factory=PPOActorConfig)
     ref: PPOActorConfig | None = field(default=None)
     critic: PPOCriticConfig | None = field(default=None)
+    teacher: TeacherConfig | None = field(
+        default=None,
+        metadata={
+            "help": (
+                "Optional teacher model configuration used for on-policy "
+                "distillation during PPO training. If provided, the actor "
+                "may be trained to match the teacher in addition to the "
+                "standard PPO objective."
+            )
+        },
+    )
     dynamic_bs: bool = field(
         default=False,
         metadata={
diff --git a/areal/trainer/ppo/actor.py b/areal/trainer/ppo/actor.py
@@ -430,6 +430,38 @@ def grpo_loss_fn(
             behave_imp_weight_mode=behave_imp_weight_mode,
         )
 
+    # Joint Distillation KL Loss
+    teacher_logp = input_data.get("teacher_logp")
+    rkl_stat = None
+    if teacher_logp is not None:
+        # Coefficients for RL and Knowledge Distillation
+        rl_loss_weight = input_data.get("rl_loss_weight", 1.0)
+        distill_loss_weight = input_data.get("distill_loss_weight", 0.005)
+
+        teacher_logp = (
+            teacher_logp.detach()
+        )  # detach to prevent gradient backprop to teacher
+
+        if rl_loss_weight == 0:
+            # Pure KD using reverse KL (importance-sampling)
+            rkl_reward = teacher_logp - logprobs.detach()
+            importance_weight = torch.exp(logprobs - old_logp)
+
+            rkl_weighted_term = importance_weight * rkl_reward * loss_mask
+
+            kd_coef = -1 * distill_loss_weight
+            loss = kd_coef * rkl_weighted_term.sum() / loss_mask.sum().clamp(min=1)
+
+            rkl_stat = -1 * rkl_weighted_term
+        else:
+            # KDRL: Knowledge Distillation + Reinforcement Learning (joint loss)
+            rkl_penalty_per_token = (logprobs - teacher_logp) * loss_mask
+            rkl_penalty = rkl_penalty_per_token.sum() / loss_mask.sum().clamp(min=1)
+
+            loss = rl_loss_weight * loss + distill_loss_weight * rkl_penalty
+
+            rkl_stat = rkl_penalty_per_token
+
     # Log training statistics
     stats_tracker.denominator(
         # NOTE: n_tokens must have shape [batch, seq] to match vocab stats.
@@ -442,6 +474,12 @@ def grpo_loss_fn(
         dual_clipped_tokens=stat["dual_clip_mask"],
     )
 
+    if rkl_stat is not None:
+        stats_tracker.stat(
+            rkl_loss=rkl_stat,
+            denominator="n_valid_tokens",
+        )
+
     stats_tracker.stat(
         importance_weight=stat["importance_weight"],
         approx_kl=stat["approx_kl"],
diff --git a/areal/trainer/rl_trainer.py b/areal/trainer/rl_trainer.py
@@ -184,6 +184,19 @@ def __init__(
         if self.ref is not None:
             self.ref.initialize(**engine_init_kwargs, role="ref")
 
+        self.teacher = None
+        if config.teacher is not None:
+            self.teacher = self._create_teacher(config.teacher)
+            teacher_allocation_mode = AllocationMode.from_str(
+                config.teacher.allocation_mode
+            )
+            teacher_init_kwargs = {
+                "addr": None,
+                "ft_spec": ft_spec,
+                "alloc_mode": teacher_allocation_mode,
+            }
+            self.teacher.initialize(**teacher_init_kwargs, role="teacher")
+
         # Save initial LoRA weights if enabled (for inference server pre-loading)
         initial_lora_path = self._save_initial_lora_weights()
 
@@ -372,6 +385,24 @@ def train(
                     rollout_batch["ref_logp"] = self.ref.compute_logp(rollout_batch)
                     self.ref.get_device_stats().log("ref logp")
 
+            if self.teacher is not None:
+                with (
+                    stats_tracker.record_timing("teacher_logp"),
+                    perf_tracer.trace_scope(
+                        "train.teacher_logp",
+                        category=Category.COMPUTE,
+                        args={"global_step": global_step},
+                    ),
+                ):
+                    rollout_batch["teacher_logp"] = self.teacher.compute_logp(
+                        rollout_batch
+                    )
+                    rollout_batch["rl_loss_weight"] = self.config.teacher.rl_loss_weight
+                    rollout_batch["distill_loss_weight"] = (
+                        self.config.teacher.distill_loss_weight
+                    )
+                    self.teacher.get_device_stats().log("teacher logp")
+
             with (
                 stats_tracker.record_timing("compute_advantage"),
                 perf_tracer.trace_scope(
@@ -664,6 +695,34 @@ def _create_critic(
         critic.create_process_group(parallel_strategy=self.allocation_mode.train)
         return critic
 
+    def _create_teacher(self, teacher_config):
+        """Build the teacher engine used for on-policy distillation.
+
+        Args:
+            teacher_config: Teacher configuration. Its ``allocation_mode``
+                string selects the training backend and parallel strategy;
+                the whole object is passed to the engine constructor.
+
+        Returns:
+            The teacher engine with its process group created. This method
+            does not call ``initialize`` on it.
+
+        Raises:
+            ValueError: If the parsed train backend is not ``fsdp``,
+                ``megatron``, or ``archon``.
+        """
+        allocation_mode = AllocationMode.from_str(teacher_config.allocation_mode)
+
+        # Each backend module is imported inside its own branch, so only the
+        # selected backend's engine module is imported.
+        if allocation_mode.train_backend == "fsdp":
+            from areal.engine.fsdp_engine import FSDPPPOActor
+
+            actor_cls = FSDPPPOActor
+        elif allocation_mode.train_backend == "megatron":
+            from areal.engine.megatron_engine import MegatronPPOActor
+
+            actor_cls = MegatronPPOActor
+        elif allocation_mode.train_backend == "archon":
+            from areal.experimental.engine.archon_engine import ArchonPPOActor
+
+            actor_cls = ArchonPPOActor
+        else:
+            raise ValueError(
+                f"Invalid backend: {allocation_mode.train_backend}, expected fsdp, megatron, or archon"
+            )
+
+        # In single-controller mode the teacher is built through
+        # ``as_controller`` with this trainer's scheduler; otherwise the
+        # engine class is instantiated directly.
+        if is_single_controller():
+            teacher = actor_cls.as_controller(teacher_config, self.scheduler)
+        else:
+            teacher = actor_cls(config=teacher_config)
+
+        # The process group follows the teacher's own parallel strategy.
+        teacher.create_process_group(parallel_strategy=allocation_mode.train)
+        return teacher
+
     def _init_rollout(
         self,
         rollout_config: InferenceEngineConfig,
diff --git a/docs/en/algorithms/distillation.md b/docs/en/algorithms/distillation.md
@@ -0,0 +1,84 @@
+# On-Policy Distillation
+
+## Overview 
+
+On-policy distillation trains the student using teacher guidance on trajectories sampled from its own policy, reducing distribution mismatch and improving stability. Combined with reinforcement learning, it lets the student **imitate the teacher while exploring simultaneously**.
+
+**AReaL** previously supported RL for post-training. With this implementation, it now also supports **on-policy knowledge distillation** and the **combined KDRL framework**, enabling the student to learn from a teacher while exploring via RL on the same on-policy trajectories, improving both efficiency and stability.
+
+## The Core Concept
+
+Knowledge distillation aims to train the student policy $\pi_\theta$ to mimic the behavior of a more powerful teacher $\pi_T$. The choice of divergence measure and sampling distribution significantly impacts the student's final performance and exposure bias.
+
+### Supervised Fine-Tuning (Forward KL):
+
+A simple yet effective method is to maximize the log-likelihood on data generated by the teacher, known as supervised fine-tuning (SFT). This is equivalent to minimizing the Forward KL divergence between $\pi_T$ and $\pi_\theta$:
+$$\arg \min_{\theta} D_{KL}(\pi_T \parallel \pi_\theta) = \arg \max_{\theta} \mathbb{E}_{q \sim Q, o \sim \pi_T(\cdot|q)} [\log \pi_\theta(o|q)]$$
+
+
+### On-Policy Distillation (Reverse KL):
+
+While SFT is efficient, training on off-policy data induces exposure bias: a mismatch between training on teacher-generated prefixes and inference on self-generated prefixes. This is especially severe for reasoning LLMs with long response chains. To alleviate this, we can train on self-generated trajectories, which is equivalent to minimizing the Reverse KL divergence (RKL) [1]:
+$$\arg \min_{\theta} D_{KL}(\pi_\theta \parallel \pi_T) = \arg \max_{\theta} \mathbb{E}_{q \sim Q, o \sim \pi_\theta(\cdot|q)} \left[ \log \frac{\pi_T(o|q)}{\pi_\theta(o|q)} \right]$$
+
+Minimizing RKL is equivalent to REINFORCE where the "reward" is the log-ratio of teacher to student probabilities. By adopting the GRPO framework, we optimize [1]:
+
+$$J_{RKL}(\theta) = \mathbb{E}_{q, \{o_i\} \sim \pi_{\theta_{old}}} \left[ \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} \frac{\pi_\theta(o_{i,t})}{\pi_{\theta_{old}}(o_{i,t})} R_{i,t} \right]$$
+
+where the reward $R_{i,t} = \log \pi_T(o_{i,t}) - \log \pi_\theta(o_{i,t})$. This encourages the student to increase the probability of tokens the teacher prefers and suppress those it deems unlikely.
+
+- Implementation Detail:
+For pure KD, set `rl_loss_weight` to 0. The implementation then estimates the RKL gradient using importance sampling: it computes the reward as `teacher_logp - logprobs` ($R_{i,t}$, with the student log-probabilities detached), weights it by the importance ratio `exp(logprobs - old_logp)`, and multiplies by `-distill_loss_weight` so that minimizing the loss maximizes $J_{RKL}$ (see `areal/trainer/ppo/actor.py`).
+
+
+### Combination of GRPO and KD
+We implement the combined KD+RL approach using a joint-loss strategy.
+
+#### Joint Loss: 
+This strategy augments the GRPO objective with an auxiliary KL loss term. To maintain consistency with the on-policy nature of GRPO, it utilizes the Reverse KL (RKL) [1]:
+$$J_{KDRL}(\theta) = J_{GRPO}(\theta) - \beta D_{KL}(\pi_\theta \parallel \pi_T)$$
+
+The gradient $\nabla_\theta J_{KDRL}(\theta)$ provides an unbiased estimate of $\nabla_\theta J_{GRPO}( \theta) + \beta \cdot \nabla_\theta J_{RKL}(\theta)$.
+
+- Implementation Detail: In the joint loss case (any non-zero `rl_loss_weight`), the RKL is treated as a direct penalty. Minimizing the term `logprobs - teacher_logp` corresponds to minimizing the Reverse KL objective $D_{KL}(\pi_\theta \parallel \pi_T)$, because its average over the valid tokens of trajectories sampled from the student distribution $\pi_\theta$ is a Monte Carlo estimate of that divergence. In the code, this is implemented as:
+`loss = rl_loss_weight * loss + distill_loss_weight * rkl_penalty`
+
+
+
+
+## Running the example 
+
+Add a `teacher` section to your YAML config:
+
+```yaml
+teacher:
+  allocation_mode: d1p1t4
+  rl_loss_weight: 1.0
+  distill_loss_weight: 0.005
+  experiment_name: ${experiment_name}
+  trial_name: ${trial_name}
+  path: Qwen/Qwen3-32B
+  init_from_scratch: false
+  disable_dropout: true
+  dtype: ${actor.dtype}
+  mb_spec:
+    max_tokens_per_mb: 10240
+  optimizer: null
+  scheduling_spec: ${actor.scheduling_spec}
+```
+
+Example command using local scheduler:
+
+```bash
+python3 examples/math/gsm8k_rl.py --config examples/distillation/gsm8k_grpo_distill.yaml scheduler.type=local experiment_name=gsm8k-grpo-distillation trial_name=trial0
+```
+
+## Result
+
+Reward curve for on-policy knowledge distillation + RL with Qwen2.5-14B-Instruct (teacher) and Qwen3-0.6B (student), trained using FSDP and vLLM.
+
+![Reward curve for on-policy distillation + RL](reward_curve.png)
+
+## References
+
+[1] Xu H, Zhu Q, Deng H, Li J, Hou L, Wang Y, Shang L, Xu R, Mi F. KDRL: Post-Training Reasoning LLMs via Unified Knowledge Distillation and Reinforcement Learning. arXiv:2506.02208. [KDRL paper link](https://arxiv.org/pdf/2506.02208)
diff --git a/docs/en/algorithms/reward_curve.png b/docs/en/algorithms/reward_curve.png
diff --git a/examples/distillation/gsm8k_grpo_distill.yaml b/examples/distillation/gsm8k_grpo_distill.yaml