Tencent
diff --git a/angelslim/compressor/__init__.py b/angelslim/compressor/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/angelslim/compressor/distill/__init__.py b/angelslim/compressor/distill/__init__.py
Lines changed: 2 additions & 1 deletion
diff --git a/angelslim/compressor/distill/distill.py b/angelslim/compressor/distill/distill.py
Lines changed: 18 additions & 94 deletions
diff --git a/angelslim/compressor/qat/plugins/distill_loss.py b/angelslim/compressor/distill/loss.py
rename from angelslim/compressor/qat/plugins/distill_loss.py
rename to angelslim/compressor/distill/loss.py
Lines changed: 1 addition & 1 deletion
diff --git a/angelslim/compressor/distill/trainer.py b/angelslim/compressor/distill/trainer.py
Lines changed: 1 addition & 1 deletion
diff --git a/angelslim/compressor/qad/__init__.py b/angelslim/compressor/qad/__init__.py
Lines changed: 17 additions & 0 deletions
@@ -14,5 +14,6 @@
 
 from .compressor_factory import CompressorFactory  # noqa: F401
 from .distill import Distill  # noqa: F401
+from .qad import QAD  # noqa: F401
 from .qat.qat import QAT  # noqa: F401
 from .quant import PTQ  # noqa: F401
@@ -13,5 +13,6 @@
 # limitations under the License.
 
 from .distill import Distill  # noqa: F401
+from .loss import DistillLoss  # noqa: F401
 
-__all__ = ["Distill"]
+__all__ = ["Distill", "DistillLoss"]
@@ -22,73 +22,45 @@
 from ...data.qat_dataset import QATDataset
 from ...utils import patch_deepspeed_duplicate_check, print_info
 from ..compressor_factory import CompressorFactory
-from ..qat.plugins import PluginManager
-from ..qat.qat import QAT
 from .trainer import DistillSeq2SeqTrainer
 
 
-def _unique_named_params(model, predicate):
-    seen = set()
-    result = []
-    for name, param in model.named_parameters():
-        if id(param) in seen or not predicate(name, param):
-            continue
-        seen.add(id(param))
-        result.append(param)
-    return result
-
-
 def _normalize_device_map(device_map):
     if isinstance(device_map, str) and device_map.lower() in ("none", "distributed"):
         return None
     return device_map
 
 
 @CompressorFactory.register
-class Distill(QAT):
+class Distill:
+    """Full-precision knowledge distillation.
+
+    Quantized-student distillation lives in ``angelslim.compressor.qad``.
+    Keeping this path fp-only prevents it from inheriting QAT state or save
+    semantics by accident.
+    """
+
     def __init__(self, model, slim_config=None):
         self.quant_model = model
         self.config = slim_config
         self.distill_config = slim_config["compress_config"].Distill
-        self.student_type = self.distill_config.student_type.lower()
+        self.student_type = getattr(self.distill_config, "student_type", "fp").lower()
         self.trainable_parameters = self.distill_config.trainable_parameters.lower()
         self.save_fmt = self.distill_config.save_format
-        self.plugin_config = self.distill_config.plugin_config
-        self.plugin_manager = PluginManager()
         self.trainer = SimpleNamespace(external_trainer=None)
-        self._rank0_state_dict = None
         self.teacher_model = None
         self.train_dataset = None
 
         self._validate_config()
-        self.is_quantized_student = self.student_type == "quantized"
-        if self.is_quantized_student:
-            self.quant_model.init_ptq(slim_config)
-            self.quant_info = self.quant_model.quant_config
-            self._init_plugins()
-        else:
-            self.quant_info = None
 
     def _validate_config(self):
         if not self.distill_config.teacher_model_path:
             raise ValueError("Distill requires compression.Distill.teacher_model_path.")
-        if self.student_type not in ("fp", "quantized"):
-            raise ValueError("Distill student_type must be 'fp' or 'quantized'.")
-        if self.trainable_parameters not in ("all", "quant"):
-            raise ValueError("Distill trainable_parameters must be 'all' or 'quant'.")
-        if self.student_type == "fp" and self.trainable_parameters == "quant":
-            raise ValueError("trainable_parameters='quant' requires a quantized student.")
-
-    def _init_plugins(self):
-        if self.plugin_config.get("enable_scale", False):
-            self.plugin_manager.register_plugin(
-                "learnable_scale",
-                quant_info=self.quant_info,
-                ignore_layers=self.config["compress_config"].quantization.ignore_layers,
-                resume_ckpt_dir=self.distill_config.resume_ckpt_dir,
-                from_ptq_ckpt_dir=self.distill_config.from_ptq_ckpt,
-                config=self.plugin_config.get("quant_config", {}),
-                quant_model=self.quant_model,
+        if self.student_type != "fp":
+            raise ValueError("Distill only supports fp students. Use QAD for quantized students.")
+        if self.trainable_parameters != "all":
+            raise ValueError(
+                "Distill trainable_parameters must be 'all'. Use QAD for quant params."
             )
 
     def _prepare_dataset(self, dataloader):
@@ -128,49 +100,11 @@ def _load_teacher_model(self):
 
     def _apply_trainable_parameters(self):
         model = self.quant_model.model
-        if self.trainable_parameters == "all":
-            for param in model.parameters():
-                param.requires_grad = True
-            return
-
-        if not any(param.requires_grad for param in model.parameters()):
-            raise ValueError("Distill quant optimizer has no trainable parameters.")
+        for param in model.parameters():
+            param.requires_grad = True
 
     def _init_optimizer(self):
-        if self.trainable_parameters == "all":
-            return None
-
-        lr = float(self.distill_config.hf_args.get("learning_rate", 1e-5))
-        wd = float(self.distill_config.hf_args.get("weight_decay", 0))
-        lwc_names = ("clip_factor_w_max", "clip_factor_w_min")
-        base_params = _unique_named_params(
-            self.quant_model.model,
-            lambda n, p: p.requires_grad and not any(key in n for key in lwc_names),
-        )
-        params = [{"params": base_params, "weight_decay": wd, "lr": lr}]
-
-        lwc_params = _unique_named_params(
-            self.quant_model.model,
-            lambda n, p: p.requires_grad and any(key in n for key in lwc_names),
-        )
-        if lwc_params:
-            lwc_lr = float(
-                self.plugin_config.get("quant_config", {}).get("lwc", {}).get("lwc_lr", lr)
-            )
-            params.append({"params": lwc_params, "weight_decay": wd, "lr": lwc_lr})
-            print_info(
-                f"Init distill optimizer with {len(base_params)} params, "
-                f"{len(lwc_params)} lwc params, lr={lr}, lwc_lr={lwc_lr}, weight_decay={wd}"
-            )
-        else:
-            print_info(
-                f"Init distill optimizer with {len(base_params)} params, "
-                f"lr={lr}, weight_decay={wd}"
-            )
-
-        if not any(group["params"] for group in params):
-            raise ValueError("Distill optimizer has no trainable parameters.")
-        return torch.optim.AdamW(params)
+        return None
 
     def _prepare_trainer(self, place_teacher_on_device):
         optimizer = self._init_optimizer()
@@ -212,9 +146,6 @@ def _load_resume_checkpoint(self):
 
     def run(self, dataloader):
         self._prepare_dataset(dataloader)
-        if self.is_quantized_student:
-            self.plugin_manager.call_before_train(train_dataset=self.train_dataset)
-
         self._apply_trainable_parameters()
         self._load_resume_checkpoint()
         self.teacher_model, place_teacher_on_device = self._load_teacher_model()
@@ -223,17 +154,10 @@ def run(self, dataloader):
         if self.distill_config.do_train:
             self.trainer.external_trainer.train()
 
-        if self.is_quantized_student:
-            self.plugin_manager.call_after_train()
-
     def convert(self):
-        if self.is_quantized_student:
-            super().convert()
+        return None
 
     def save(self, save_path: str):
-        if self.is_quantized_student:
-            return super().save(save_path)
-
         if self.save_fmt not in ("hf", "real", "full"):
             print_info("Save format not specified, skip save.")
             return None
 
@@ -96,7 +96,7 @@ def compute(self, student_logits, teacher_logits, labels):
             kd = self._kl_from_logps(top_s_logp, top_t_logp).mean()
         else:
             raise ValueError(
-                f"Unsupported QAT kd loss_type: {self.loss_type}. "
+                f"Unsupported distill loss_type: {self.loss_type}. "
                 "Valid: kl, rkl, mse, kd, cakld, kl_top[_K], r_kl_top[_K]."
             )
 
 
@@ -17,7 +17,7 @@
 import torch
 from transformers import Seq2SeqTrainer
 
-from ..qat.plugins.distill_loss import DistillLoss
+from .loss import DistillLoss
 
 
 class DistillSeq2SeqTrainer(Seq2SeqTrainer):
 
@@ -0,0 +1,17 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .qad import QAD  # noqa: F401
+
+__all__ = ["QAD"]
| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -96,7 +96,7 @@ def compute(self, student_logits, teacher_logits, labels):` |
| `96` | `96` | `kd = self._kl_from_logps(top_s_logp, top_t_logp).mean()` |
| `97` | `97` | `else:` |
| `98` | `98` | `raise ValueError(` |
| `99` |  | `- f"Unsupported QAT kd loss_type: {self.loss_type}. "` |
|  | `99` | `+ f"Unsupported distill loss_type: {self.loss_type}. "` |
| `100` | `100` | `"Valid: kl, rkl, mse, kd, cakld, kl_top[_K], r_kl_top[_K]."` |
| `101` | `101` | `)` |
| `102` | `102` |  |
`102`	`102`