update changelog and fix ptq typo (#60)

yghstill · web-flow · commit d71ca77bfb73 · 2025-09-01T21:43:07.000+08:00
diff --git a/README.md b/README.md
@@ -31,6 +31,7 @@
 - [技术交流](#技术交流)
 
 ## 📣最新进展
+- [25/09/01] 我们支持了[Hunyuan-MT-7B](https://huggingface.co/tencent/Hunyuan-MT-7B-fp8)翻译开源模型的FP8量化；支持了Eagle3的Torch推理及Benchmark评测流程；支持了[FLUX](https://github.com/Tencent/AngelSlim/tree/main/configs/flux)的量化、Cache；支持了[Seed-OSS](https://github.com/Tencent/AngelSlim/tree/main/configs/seed_oss)模型量化压缩。
 - [25/08/06] 我们支持了`Hunyuan 0.5B/1.8B/4B/7B`和`Qwen2.5VL 3B/7B/32B/72B`的FP8、INT4量化，支持了`DeepSeek-R1/V3`和`Kimi-K2`模型的`FP8-Static`、`W4A8-FP8`量化。我们还开源了`Hunyuan 1.8B/4B/7B`系列模型的Eagle3权重。
 - [25/07/04] 我们支持了`Hunyuan/Qwen2.5/Qwen3/DeepSeek-R1-Distill-Qwen`等模型的量化，包含INT8、FP8、INT4等算法。
 我们还开源了`Qwen3`系列模型的Eagle3权重。
diff --git a/README_en.md b/README_en.md
@@ -31,6 +31,7 @@ Dedicated to building a more intuitive, comprehensive, and efficient LLMs compre
 - [Technical Discussion](#technical-discussion)
 
 ## 📣Latest Updates
+- [25/09/01] We now support FP8 quantization of the [Hunyuan-MT-7B](https://huggingface.co/tencent/Hunyuan-MT-7B-fp8) translation model, Torch inference and benchmark evaluation for Eagle3, quantization and Cache for [FLUX](https://github.com/Tencent/AngelSlim/tree/main/configs/flux), and quantization for [Seed-OSS](https://github.com/Tencent/AngelSlim/tree/main/configs/seed_oss).
 - [25/08/06] We now support quantization for `Hunyuan 0.5B/1.8B/4B/7B` and multimodal model `Qwen2.5VL 3B/7B/32B/72B`, including `FP8/INT4` algorithms, and quantization for `DeepSeek-R1/V3` and `Kimi-K2`, including `FP8-Static` and `W4A8-FP8` algorithms. We also opensource `Hunyuan 1.8B/4B/7B` series Eagle3 model weight.
 - [25/07/04] We now support quantization for `Hunyuan/Qwen2.5/Qwen3/DeepSeek-R1-Distill-Qwen` and other models, including `INT8/FP8/INT4` algorithms. We also opensource `Qwen3` series Eagle3 model weight.
 
diff --git a/angelslim/compressor/quant/modules/fp8/lepto_fp8.py b/angelslim/compressor/quant/modules/fp8/lepto_fp8.py
@@ -48,7 +48,7 @@ def __init__(
         self.ptq_hook = ptq_hook
         self.quant_model = model  # self.quant_model
         self.modal_type = self.quant_model.modal_type
-        self.layers = self.quant_model.model.model.layers
+        self.layers = self.quant_model.get_quant_module()
         self.quant_bits = self.quant_model.quant_config.quant_bit
         self.seq_length = seq_length
         self.hidden_size = hidden_size
@@ -252,9 +252,11 @@ def convert(self):
         torch.cuda.empty_cache()
 
         # 2. insert qdq module
-        layers = self.quant_model.get_model()
+        quant_convert_module = self.quant_model.get_quant_convert_module()
         for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
-            parent_layer, sub_name = find_parent_layer_and_sub_name(layers, name)
+            parent_layer, sub_name = find_parent_layer_and_sub_name(
+                quant_convert_module, name
+            )
 
             qdq_module = self.quant_model.get_qdq_module(sub_layer, name)
             setattr(parent_layer, sub_name, qdq_module)
diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py
@@ -36,7 +36,6 @@ def __init__(self, model, slim_config=None):
         self.quant_model = model
         # init ptq config of model
         self.quant_model.init_ptq(slim_config)
-        self.layers = self.quant_model.get_quant_module()
         self.quant_algo = self.quant_model.quant_config.quant_algo
         self.quant_helpers = self.quant_model.quant_config.quant_helpers
         if "fp8" in self.quant_algo or "int8" in self.quant_algo:
@@ -210,9 +209,12 @@ def _convert(self):
 
         self.ptq_hook.post_process()
 
+        quant_convert_module = self.quant_model.get_quant_convert_module()
         # 2. insert qdq module
         for name, sub_layer in self.ptq_hook.quant_layers_dict.items():
-            parent_layer, sub_name = find_parent_layer_and_sub_name(self.layers, name)
+            parent_layer, sub_name = find_parent_layer_and_sub_name(
+                quant_convert_module, name
+            )
 
             qdq_module = self.quant_model.get_qdq_module(sub_layer, name)
             setattr(parent_layer, sub_name, qdq_module)
diff --git a/angelslim/models/base_model.py b/angelslim/models/base_model.py
@@ -116,6 +116,13 @@ def get_quant_module(self):
         """
         return self.model.model.layers
 
+    def get_quant_convert_module(self):
+        """
+        Return the root module in which quantized sub-layers are looked up and replaced.
+        The base implementation returns the whole wrapped model (self.model).
+        """
+        return self.model
+
     def get_qdq_module(self, sub_layer, name):
         act_scale, weight_scale = None, None
         if name in self.act_scales_dict:
diff --git a/angelslim/models/diffusion/flux.py b/angelslim/models/diffusion/flux.py
@@ -159,6 +159,13 @@ def get_quant_module(self):
         """
         return self.model.transformer
 
+    def get_quant_convert_module(self):
+        """
+        Return the root module in which quantized sub-layers are looked up and replaced.
+        For this model it is the transformer (self.model.transformer).
+        """
+        return self.model.transformer
+
     def get_save_func(self):
         if self.deploy_backend in ["huggingface"]:
             return PTQDiffusionSave