From e3e2b2c2ea0205c9ae4966f688ef82c186d56c87 Mon Sep 17 00:00:00 2001
From: root <root@TENCENT64.site>
Date: Tue, 13 Jan 2026 19:10:46 +0800
Subject: [PATCH 1/4] feature: support Qwen3_VL_Moe static quantization

---
 angelslim/compressor/quant/core/quant_func.py |   7 +
 angelslim/compressor/quant/ptq.py             | 131 ++++++
 angelslim/data/multimodal_dataset.py          |  95 ++++-
 angelslim/models/vlm/__init__.py              |   1 +
 angelslim/models/vlm/qwen3_vl_moe.py          | 387 ++++++++++++++++++
 angelslim/utils/config_parser.py              |   2 +-
 .../qwen3_vl-235b_a3b_fp8_static.yaml         |  36 ++
 .../qwen3_vl-30b_a3b_fp8_static.yaml          |  36 ++
 .../fake_data_openai_formate.json             |   2 +
 9 files changed, 676 insertions(+), 21 deletions(-)
 create mode 100644 angelslim/models/vlm/qwen3_vl_moe.py
 create mode 100644 configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml
 create mode 100644 configs/qwen3_vl/fp8_static/qwen3_vl-30b_a3b_fp8_static.yaml
 create mode 100644 dataset/multimodal_fake_data/fake_data_openai_formate.json

diff --git a/angelslim/compressor/quant/core/quant_func.py b/angelslim/compressor/quant/core/quant_func.py
index bb4ea8d3..18280189 100644
--- a/angelslim/compressor/quant/core/quant_func.py
+++ b/angelslim/compressor/quant/core/quant_func.py
@@ -73,10 +73,17 @@ def quantize_weight_per_tensor_fp8(
 ) -> Tuple[torch.Tensor, float]:
     finfo = torch.finfo(torch.float8_e4m3fn)
 
+    squeeze_dim = False
+    while scale.ndim < tensor.ndim:
+        scale = scale.unsqueeze(-1)
+        squeeze_dim = True
+
     qweight = (tensor / scale).clamp(min=finfo.min, max=finfo.max)
     # Return both float8 data and the inverse scale (as float),
     # as both required as inputs to torch._scaled_mm
     qweight = qweight.to(torch.float8_e4m3fn)
+    if squeeze_dim:
+        scale = scale.squeeze(-1)
     scale = scale.float()
     return qweight, scale
 
diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py
index 8aabb1f2..39252b7f 100644
--- a/angelslim/compressor/quant/ptq.py
+++ b/angelslim/compressor/quant/ptq.py
@@ -18,6 +18,12 @@
 
 import torch
 from safetensors.torch import load_file
+from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
+
+from angelslim.compressor.quant.core.quant_func import (
+    get_fp_maxval,
+    quantize_weight_per_tensor_fp8,
+)
 
 from ...utils import find_parent_layer_and_sub_name, print_info
 from ..compressor_factory import CompressorFactory
@@ -284,7 +290,132 @@ def _convert(self):
 
             if qdq_module is not sub_layer:
                 setattr(parent_layer, sub_name, qdq_module)
+
+        maxval = get_fp_maxval(bits=8)
+        for name, sub_layer in self.quant_model.model.named_modules():
+            if isinstance(sub_layer, Qwen3VLMoeTextExperts):
+                parent_layer, sub_name = find_parent_layer_and_sub_name(
+                    quant_convert_module, name
+                )
+                gate_up_act_max = sub_layer.gateupobservers.scales()
+                down_act_max = sub_layer.downobservers.scales()
+                gate_up_act_dtype = gate_up_act_max.dtype
+                down_act_dtype = down_act_max.dtype
+                gate_up_act_scale = gate_up_act_max / maxval.type(gate_up_act_dtype)
+                down_act_scale = down_act_max / maxval.type(down_act_dtype)
+
+                gate_proj, up_proj = sub_layer.gate_up_proj.chunk(2, dim=-1)
+                abs_inputs = torch.abs(gate_proj)
+                batch_size = abs_inputs.shape[0]
+                abs_inputs_flat = abs_inputs.view(batch_size, -1)
+                gate_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
+
+                abs_inputs = torch.abs(up_proj)
+                batch_size = abs_inputs.shape[0]
+                abs_inputs_flat = abs_inputs.view(batch_size, -1)
+                up_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
+
+                abs_inputs = torch.abs(sub_layer.down_proj)
+                batch_size = abs_inputs.shape[0]
+                abs_inputs_flat = abs_inputs.view(batch_size, -1)
+                down_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
+
+                gate_weight_dtype = gate_proj.dtype
+                up_weight_dtype = up_proj.dtype
+                down_weight_dtype = sub_layer.down_proj.dtype
+                gate_weight_scale = gate_weight_max / maxval.type(gate_weight_dtype)
+                up_weight_scale = up_weight_max / maxval.type(up_weight_dtype)
+                down_weight_scale = down_weight_max / maxval.type(down_weight_dtype)
+
+                q_linear = MyQDQModule(
+                    gate_proj=gate_proj.cpu(),
+                    up_proj=up_proj.cpu(),
+                    down_proj=sub_layer.down_proj.cpu(),
+                    gate_proj_weight_scale=gate_weight_scale.cpu(),
+                    up_proj_weight_scale=up_weight_scale.cpu(),
+                    down_proj_weight_scale=down_weight_scale.cpu(),
+                    gate_up_proj_input_scale=gate_up_act_scale.cpu(),
+                    down_proj_input_scale=down_act_scale.cpu(),
+                )
+                setattr(parent_layer, sub_name, q_linear)
         self.quant_model.quantized = True
 
     def __getattr__(self, item):
         return super().__getattr__(item)
+
+
+class MyQDQModule(torch.nn.Module):
+    def __init__(
+        self,
+        gate_proj: torch.nn.Parameter,
+        up_proj: torch.nn.Parameter,
+        down_proj: torch.nn.Parameter,
+        gate_proj_weight_scale: torch.nn.Parameter,
+        up_proj_weight_scale: torch.nn.Parameter,
+        down_proj_weight_scale: torch.nn.Parameter,
+        gate_up_proj_input_scale: torch.nn.Parameter,
+        down_proj_input_scale: torch.nn.Parameter,
+    ):
+        super().__init__()
+        quant_gate_weight, _ = quantize_weight_per_tensor_fp8(
+            gate_proj, gate_proj_weight_scale
+        )
+        quant_up_weight, _ = quantize_weight_per_tensor_fp8(
+            up_proj, up_proj_weight_scale
+        )
+        quant_down_weight, _ = quantize_weight_per_tensor_fp8(
+            down_proj, down_proj_weight_scale
+        )
+        quant_gate_up_weight = torch.cat([quant_gate_weight, quant_up_weight], dim=-1)
+
+        self.gate_up_proj = torch.nn.Parameter(
+            quant_gate_up_weight, requires_grad=False
+        )
+        self.down_proj = torch.nn.Parameter(quant_down_weight, requires_grad=False)
+
+        gate_proj_weight_scale = (
+            gate_proj_weight_scale.view(-1)
+            if gate_proj_weight_scale.ndim == 0
+            else gate_proj_weight_scale
+        )
+        up_proj_weight_scale = (
+            up_proj_weight_scale.view(-1)
+            if up_proj_weight_scale.ndim == 0
+            else up_proj_weight_scale
+        )
+        down_proj_weight_scale = (
+            down_proj_weight_scale.view(-1)
+            if down_proj_weight_scale.ndim == 0
+            else down_proj_weight_scale
+        )
+        gate_up_proj_weight_scale = torch.cat(
+            [gate_proj_weight_scale, up_proj_weight_scale], dim=-1
+        )
+
+        self.gate_up_proj_weight_scale = torch.nn.Parameter(
+            gate_up_proj_weight_scale, requires_grad=False
+        )
+        self.down_proj_weight_scale = torch.nn.Parameter(
+            down_proj_weight_scale, requires_grad=False
+        )
+
+        down_proj_input_scale = (
+            down_proj_input_scale.view(-1)
+            if down_proj_input_scale.ndim == 0
+            else down_proj_input_scale.squeeze()
+        )
+        gate_up_proj_input_scale = (
+            gate_up_proj_input_scale.view(-1)
+            if gate_up_proj_input_scale.ndim == 0
+            else gate_up_proj_input_scale.squeeze()
+        )
+
+        self.gate_up_proj_input_scale = torch.nn.Parameter(
+            gate_up_proj_input_scale, requires_grad=False
+        )
+        self.down_proj_input_scale = torch.nn.Parameter(
+            down_proj_input_scale, requires_grad=False
+        )
+
+    def forward(self, x):
+        pass
diff --git a/angelslim/data/multimodal_dataset.py b/angelslim/data/multimodal_dataset.py
index a51a6ccb..c6e80ea5 100644
--- a/angelslim/data/multimodal_dataset.py
+++ b/angelslim/data/multimodal_dataset.py
@@ -49,7 +49,7 @@ def __init__(
 
     def _load_file_based_dataset(self, data_path: str, num_samples: int):
         """Load dataset from local file system"""
-        image_dir = os.path.join(os.path.dirname(data_path), "images")
+        self.data_path = data_path
         line_count = 0
 
         with open(data_path, "r") as f:
@@ -58,29 +58,84 @@ def _load_file_based_dataset(self, data_path: str, num_samples: int):
                     break
 
                 data = json.loads(line.strip())
-                image_path = os.path.join(image_dir, data["img_path"])
 
-                # Prepare chat messages with image
-                messages = [
-                    {
-                        "role": "user",
-                        "content": [
-                            {"type": "image", "image": image_path},
-                            {
-                                "type": "text",
-                                "text": data["question"].replace("<image>", ""),
-                            },
-                        ],
-                    },
-                    {
-                        "role": "assistant",
-                        "content": [{"type": "text", "text": data["answer"]}],
-                    },
-                ]
+                # Validate format
+                assert "messages" in data or "question" in data, "JSON format error"
+
+                # Prepare messages
+                messages = self._prepare_messages(data)
 
                 self._process_and_append(messages)
                 line_count += 1
 
+    def _prepare_messages(self, data: Dict) -> List[Dict]:
+        """Build the chat ``messages`` list for one JSON record.
+
+        Two record layouts are accepted:
+
+        * flat: ``question`` / ``answer`` plus optional ``system_prompt`` and
+          ``img_path``;
+        * chat: a ready-made ``messages`` list whose user image parts hold
+          file names.
+
+        Image file names are joined onto the ``images`` directory located next
+        to the dataset file. For ``HunyuanVL`` the system and assistant contents
+        are flattened to the text of their first part. A content that is already
+        a plain string is left untouched.
+
+        Args:
+            data: one decoded JSON line of the dataset file.
+
+        Returns:
+            The list of role/content dicts handed to ``_process_and_append``.
+
+        Raises:
+            ValueError: if the record has neither ``question`` nor ``messages``.
+        """
+        image_dir = os.path.join(os.path.dirname(self.data_path), "images")
+        if "question" in data:
+            # flat layout: optional system turn, then one user/assistant pair
+            messages = []
+            if "system_prompt" in data:
+                system_text = {"type": "text", "text": data["system_prompt"]}
+                messages.append({"role": "system", "content": [system_text]})
+            if "img_path" in data:
+                image_path = os.path.join(image_dir, data["img_path"])
+                # the image travels as its own part, so drop the placeholder
+                question = data["question"].replace("<image>", "")
+                user_content = [
+                    {"type": "image", "image": image_path},
+                    {"type": "text", "text": question},
+                ]
+            else:
+                user_content = [{"type": "text", "text": data["question"]}]
+            answer = [{"type": "text", "text": data["answer"]}]
+            messages.append({"role": "user", "content": user_content})
+            messages.append({"role": "assistant", "content": answer})
+        elif "messages" in data:
+            # NOTE: the record's own list is edited in place
+            messages = data["messages"]
+            for message in messages:
+                # plain-string content carries no image parts to rewrite
+                if message["role"] != "user" or isinstance(message["content"], str):
+                    continue
+                for content in message["content"]:
+                    if content["type"] == "image":
+                        content["image"] = os.path.join(image_dir, content["image"])
+        else:
+            raise ValueError("Invalid data format")
+
+        # adapt to hunyuan_vl
+        if self.model_name in ["HunyuanVL"]:
+            for message in messages:
+                if message["role"] not in ("assistant", "system"):
+                    continue
+                # already flat text: nothing to unwrap
+                if isinstance(message["content"], str):
+                    continue
+                message["content"] = message["content"][0]["text"]
+        return messages
+
     def _load_hf_dataset(self, dataset: str, num_samples: int):
         """Load dataset from Hugging Face format"""
         dataset = load_dataset(dataset, split="test")
@@ -108,7 +163,7 @@ def _load_hf_dataset(self, dataset: str, num_samples: int):
 
     def _process_and_append(self, messages: List[Dict]):
         """Process messages and append to dataset"""
-        if self.model_name in ["Qwen3VL"]:
+        if self.model_name in ["Qwen3VL", "Qwen3VLMoE"]:
             inputs = self.processor.apply_chat_template(
                 messages,
                 tokenize=True,
diff --git a/angelslim/models/vlm/__init__.py b/angelslim/models/vlm/__init__.py
index 6eccd510..ce45e949 100644
--- a/angelslim/models/vlm/__init__.py
+++ b/angelslim/models/vlm/__init__.py
@@ -14,4 +14,5 @@
 
 from .hunyuan_vl import HunyuanVL  # noqa: F401
 from .qwen3_vl import Qwen3VL  # noqa: F401
+from .qwen3_vl_moe import Qwen3VLMoE  # noqa: F401
 from .qwen_vl import QwenVL  # noqa: F401
diff --git a/angelslim/models/vlm/qwen3_vl_moe.py b/angelslim/models/vlm/qwen3_vl_moe.py
new file mode 100644
index 00000000..5c28f532
--- /dev/null
+++ b/angelslim/models/vlm/qwen3_vl_moe.py
@@ -0,0 +1,387 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from tqdm import tqdm
+from transformers import (
+    AutoProcessor,
+    AutoTokenizer,
+    Qwen3VLMoeForConditionalGeneration,
+)
+from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
+
+from angelslim.compressor.quant.observers.base_observer import BaseObserver
+
+from ...compressor.quant.core import LossFilter, PTQVLMSaveVllmHF
+from ...utils import find_layers, print_info
+from ..base_model import BaseLLMModel
+from ..model_factory import SlimModelFactory
+
+
+def observer_forward(
+    self,
+    hidden_states: torch.Tensor,
+    routing_weights: torch.Tensor,
+    router_indices: torch.Tensor,
+) -> torch.Tensor:
+    """
+    When training it is more efficient to just loop over the experts and
+    compute the output for each expert
+    as otherwise the memory would explode.
+
+    For inference we can sacrifice some memory and compute the output for
+    all experts at once. By repeating the inputs.
+
+    Args:
+        hidden_states (torch.Tensor): (batch_size * token_num, hidden_size)
+        routing_weights (torch.Tensor): (batch_size * token_num, num_experts)
+        router_indices (torch.Tensor): (batch_size * token_num, top_k)
+    Returns:
+        torch.Tensor
+    """
+    # This function replaces Qwen3VLMoeTextExperts.forward.
+    batch_size = hidden_states.shape[0]
+    hidden_states = hidden_states.reshape(
+        -1, self.hidden_size
+    )  # (num_tokens, hidden_size)
+    if self.training:
+        next_states = torch.zeros_like(
+            hidden_states, dtype=hidden_states.dtype, device=hidden_states.device
+        )
+        with torch.no_grad():
+            expert_mask = torch.nn.functional.one_hot(
+                router_indices, num_classes=self.num_experts
+            )
+            expert_mask = expert_mask.permute(2, 1, 0)
+            # we sum on the top_k and on the sequence length to get which experts
+            # are hit this time around
+            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit[:]:
+            with torch.no_grad():
+                _, token_idx = torch.where(expert_mask[expert_idx[0]])
+            current_state = hidden_states[token_idx]
+            gate_up = current_state @ self.gate_up_proj[expert_idx]
+            gate, up = gate_up.chunk(2, dim=-1)
+            gated_output = up * self.act_fn(gate)
+            out = gated_output @ self.down_proj[expert_idx]
+            weighted_output = out[0] * routing_weights[token_idx, expert_idx, None]
+            next_states.index_add_(
+                0, token_idx, weighted_output.to(hidden_states.dtype)
+            )
+        next_states = next_states.view(batch_size, -1, self.hidden_size)
+    else:
+        hidden_states = hidden_states.repeat(self.num_experts, 1)
+        hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
+        if self.gateupobservers:
+            self.gateupobservers(hidden_states)
+        gate_up = torch.bmm(hidden_states, self.gate_up_proj)
+        gate, up = gate_up.chunk(2, dim=-1)  # not supported for DTensors
+        if self.downobservers:
+            down_input = up * self.act_fn(gate)
+            self.downobservers(down_input)
+        next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj)
+        next_states = next_states.reshape(
+            self.num_experts, batch_size, -1, self.hidden_size
+        )
+        next_states = (
+            next_states
+            * routing_weights.transpose(0, 1).view(self.num_experts, batch_size, -1)[
+                ..., None
+            ]
+        )
+        next_states = next_states.sum(dim=0)
+    return next_states
+
+
+Qwen3VLMoeTextExperts.forward = observer_forward
+
+
+@SlimModelFactory.register
+class Qwen3VLMoE(BaseLLMModel):
+    def __init__(
+        self,
+        model=None,
+        deploy_backend="vllm",
+    ):
+        super().__init__(
+            model=model,
+            deploy_backend=deploy_backend,
+        )
+        self.modal_type = "VLM"
+        self.block_name = "model.language_model.layers"
+        self.vit_block_name = "model.visual.blocks"
+        self.pre_transformer_module_names = [
+            "visual",
+            "language_model.embed_tokens",
+            "language_model.norm",
+            "language_model.rotary_emb",
+        ]
+        self.observer_layer_classes = [nn.Linear, Qwen3VLMoeTextExperts]
+
+    def from_pretrained(
+        self,
+        model_path,
+        torch_dtype="auto",
+        device_map="auto",
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+        use_cache=False,
+        using_multi_nodes=False,
+        compress_config=None,
+    ):
+        self.model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
+            model_path,
+            torch_dtype=torch_dtype,
+            device_map=device_map,
+            trust_remote_code=trust_remote_code,
+            low_cpu_mem_usage=low_cpu_mem_usage,
+        )
+
+        # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_path, trust_remote_code=trust_remote_code
+        )
+
+        # Load processor
+        self.processor = AutoProcessor.from_pretrained(
+            model_path, trust_remote_code=trust_remote_code
+        )
+
+    def get_observer_layers(self):
+        """Collect the layers to observe and attach observers to the MoE experts."""
+        names = [
+            "self_attn.k_proj",
+            "self_attn.v_proj",
+            "self_attn.q_proj",
+            "self_attn.o_proj",
+        ]
+
+        if hasattr(self.quant_config, "quant_vit") and self.quant_config.quant_vit:
+            names.extend(["attn.qkv", "attn.proj", "mlp.linear_fc1", "mlp.linear_fc2"])
+
+        observer_layers_dict = {}
+        layers_dict = find_layers(self.model, layers=self.observer_layer_classes)
+
+        ignore_layers = self.skip_layer_names()
+        for name, module in layers_dict.items():
+            block_condition = name.startswith(self.block_name) or (
+                hasattr(self.quant_config, "quant_vit")
+                and self.quant_config.quant_vit
+                and name.startswith(self.vit_block_name)
+            )
+            parts = name.split(".")
+            result = ".".join(parts[-2:])
+            if result == "mlp.experts":
+                if not hasattr(module, "gateupobservers"):
+                    layername = name + ".gate_up"
+                    module.gateupobservers = MyAbsmaxPertensorObserver(
+                        layer_name=layername
+                    )
+                if not hasattr(module, "downobservers"):
+                    layername = name + ".down"
+                    module.downobservers = MyAbsmaxPertensorObserver(
+                        layer_name=layername
+                    )
+            else:
+                if block_condition and result in names:
+                    observer_layers_dict[name] = module
+                else:
+                    ignore_layers.append(name)
+        self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers
+        if self.quant_config.custom_observe_layers_names != "default":
+            for custom_observe_name in self.quant_config.custom_observe_layers_names:
+                for default_name in list(observer_layers_dict):  # pop() mutates it
+                    if custom_observe_name not in default_name:
+                        observer_layers_dict.pop(default_name)
+        return observer_layers_dict
+
+    def model_forward(self, dataloader, **kwargs):
+        """Run the calibration batches through the model and log each perplexity."""
+        self.model.use_cache = False
+
+        calibrated_cnt = 0
+        if (
+            "gptq" in self.quant_config.quant_algo
+            or "awq" in self.quant_config.quant_algo
+            or "gptaq" in self.quant_config.quant_algo
+        ):
+            device = "cuda:0"
+        else:
+            device = self.model.device
+        print_info(f"device is {device}")
+        if dataloader is not None:
+            loss_filter = LossFilter(processor=self.processor)
+            with torch.no_grad():
+                for batch in tqdm(
+                    dataloader, desc="calibrating...", total=len(dataloader)
+                ):
+                    if "pixel_values" in batch:
+                        inputs = {
+                            "input_ids": batch["input_ids"].to(device),
+                            "attention_mask": batch["attention_mask"].to(device),
+                            "pixel_values": batch["pixel_values"].to(device),
+                            "image_grid_thw": batch["image_grid_thw"].to(device),
+                        }
+                    else:
+                        inputs = {
+                            "input_ids": batch["input_ids"].to(device),
+                            "attention_mask": batch["attention_mask"].to(device),
+                        }
+
+                    inputs["use_cache"] = False
+                    labels = batch["labels"].to(device)
+                    attention_mask = batch["attention_mask"].to(device)
+                    try:
+                        outputs = self.model(**inputs)
+                        logits = outputs.logits.float()
+
+                        loss = F.cross_entropy(
+                            logits.view(-1, logits.size(-1)),
+                            labels.view(-1),
+                            reduction="none",
+                        )
+
+                        attention_mask = (
+                            attention_mask.view(-1).to(logits.device).float()
+                        )
+                        loss = loss * attention_mask
+                        loss = loss_filter.filter_loss(
+                            loss=loss, labels=labels, model_type="Qwen3VL"
+                        )
+                        avg_loss = loss.mean()
+                        ppl = torch.exp(avg_loss)
+
+                        print_info(f"ppl is : {ppl:.4f}")
+
+                        calibrated_cnt += 1
+                    except ValueError:
+                        calibrated_cnt += 1
+                        outputs = None  # batch skipped; keeps the del below valid
+                    inputs = {
+                        k: v.to("cpu") if isinstance(v, torch.Tensor) else v
+                        for k, v in inputs.items()
+                    }
+                    attention_mask = attention_mask.to("cpu")
+                    labels = labels.to("cpu")
+                    del outputs, inputs
+                    torch.cuda.synchronize()
+                    torch.cuda.empty_cache()
+                    gc.collect()
+
+    def get_quant_module(self):
+        """
+        Returns the module that will be quantized.
+        This is typically the main transformer module of the model.
+        """
+        return self.model.model.language_model.layers
+
+    def get_save_func(self):
+        if self.deploy_backend in ["vllm", "huggingface"]:
+            return PTQVLMSaveVllmHF
+        else:
+            raise NotImplementedError(
+                f"deploy_backend {self.deploy_backend} is not supported for saving."
+            )
+
+
+class MyAbsmaxPertensorObserver(BaseObserver):
+    def __init__(self, layer_name=None, quant_bits=8, **kwargs):
+        super(MyAbsmaxPertensorObserver, self).__init__(quant_bits=quant_bits)
+        self.layer_name = layer_name
+        self._scale = None
+        self._zero_point = None
+        self._min = None
+        self._max = torch.tensor(1e-7, dtype=torch.float32)
+        self.step = 0
+        self.dtype = None
+        self.parent_observer = (
+            kwargs["parent_observer"]
+            if kwargs and "parent_observer" in kwargs
+            else None
+        )
+
+    def forward(self, inputs):
+        """Calculate forward pass."""
+        self.step += 1
+        if not self.dtype:
+            self.dtype = inputs.dtype
+        if inputs.numel() > 0:
+            self._min, self._max = self._cal_min_max(inputs)
+            if self.parent_observer is not None:
+                self.parent_observer.update(self._min, self._max, self.step)
+        else:
+            assert self.parent_observer is not None
+            self._update_min_max(self.parent_observer.min, self.parent_observer.max)
+        return inputs
+
+    def _cal_min_max(self, inputs):
+        if inputs.dim() >= 2:
+            abs_inputs = torch.abs(inputs)
+            batch_size = abs_inputs.shape[0]
+            abs_inputs_flat = abs_inputs.view(
+                batch_size, -1
+            )  # [batch_size, seq_len * hidden_dim]
+            abs_max_val, _ = torch.max(
+                abs_inputs_flat, dim=1, keepdim=True
+            )  # [batch_size, 1]
+            min_threshold = self._max.to(abs_max_val.device).expand_as(abs_max_val)
+            abs_max_val = torch.maximum(abs_max_val, min_threshold)
+        else:
+            abs_max_val = torch.max(torch.abs(inputs))
+            if abs_max_val.data < self._max.data:
+                abs_max_val = self._max
+            abs_max_val = abs_max_val.unsqueeze(0).unsqueeze(0)  # [1, 1]
+        return 0, abs_max_val.to(inputs.device)
+
+    def _update_min_max(self, min, max):
+        if min is not None and max is not None:
+            if self._min is None or min < self._min:
+                self._min = min
+            if self._max is None or max > self._max:
+                self._max = max
+
+    def cal_thresholds(self):
+        """Compute thresholds for MAX function."""
+        if self._scale is None:
+            self._scale = self._max
+        self._zero_point = torch.zeros_like(self._scale)
+
+    def quant_axis(self):
+        """Return quantization axis."""
+        return -1
+
+    def scales(self):
+        """Return output scales."""
+        if self.step == 0 and self.parent_observer is not None:
+            self._update_min_max(self.parent_observer.min, self.parent_observer.max)
+            self.step = self.parent_observer.step
+        if self.step == 0:
+            raise ValueError(
+                "AbsmaxPertensorObserver scales must calibrate data first!"
+            )
+        if self._scale is None:
+            self.cal_thresholds()
+        if self.dtype:
+            self._scale = self._scale.type(self.dtype)
+        return self._scale
+
+    def zero_points(self):
+        """Return output zero points."""
+        if self._zero_point is None:
+            self.cal_thresholds()
+        return self._zero_point
diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py
index 5c048f73..8acb50cf 100644
--- a/angelslim/utils/config_parser.py
+++ b/angelslim/utils/config_parser.py
@@ -92,7 +92,7 @@ def get_max_seq_length(self) -> int:
     def set_model_hidden_size(self, model_path) -> int:
         json_data = get_hf_config(model_path)
         try:
-            if json_data["model_type"] in ["qwen3_vl"]:
+            if json_data["model_type"] in ["qwen3_vl", "qwen3_vl_moe"]:
                 self.hidden_size = json_data["text_config"]["hidden_size"]
             elif (
                 json_data["architectures"][0]
diff --git a/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml b/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml
new file mode 100644
index 00000000..f1251767
--- /dev/null
+++ b/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml
@@ -0,0 +1,36 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: Qwen3VLMoE
+  model_path: Qwen/Qwen3-VL-235B-A22B-Instruct
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_static
+    bits: 8
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "model.visual.patch_embed.proj"
+      - "model.lm_head"
+      - "model.language_model.embed_tokens"
+    quant_vit: false
+
+# Dataset for calibration
+dataset:
+  name: MultiModalDataset
+  data_path: dataset/multimodal_fake_data/fake_data_openai_formate.json
+  max_seq_length: 4096
+  num_samples: 1024
+  batch_size: 1
\ No newline at end of file
diff --git a/configs/qwen3_vl/fp8_static/qwen3_vl-30b_a3b_fp8_static.yaml b/configs/qwen3_vl/fp8_static/qwen3_vl-30b_a3b_fp8_static.yaml
new file mode 100644
index 00000000..12b235fa
--- /dev/null
+++ b/configs/qwen3_vl/fp8_static/qwen3_vl-30b_a3b_fp8_static.yaml
@@ -0,0 +1,36 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: Qwen3VLMoE
+  model_path: Qwen/Qwen3-VL-30B-A3B-Instruct
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_static
+    bits: 8
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:         # Skip quantization for these layers
+      - "model.visual.patch_embed.proj"
+      - "model.lm_head"
+      - "model.language_model.embed_tokens"
+    quant_vit: false
+
+# Dataset for calibration
+dataset:
+  name: MultiModalDataset
+  data_path: dataset/multimodal_fake_data/fake_data_openai_formate.json
+  max_seq_length: 4096
+  num_samples: 1024
+  batch_size: 1
\ No newline at end of file
diff --git a/dataset/multimodal_fake_data/fake_data_openai_formate.json b/dataset/multimodal_fake_data/fake_data_openai_formate.json
new file mode 100644
index 00000000..05d7ffc9
--- /dev/null
+++ b/dataset/multimodal_fake_data/fake_data_openai_formate.json
@@ -0,0 +1,2 @@
+{"id": "0", "messages": [{"role": "user", "content": [{"type": "image", "image": "0.png"}, {"type": "text", "text": "How many food item is shown in the bar graph?"}]}, {"role": "assistant", "content": [{"type": "text", "text": "14"}]}]}
+{"id": "1", "messages": [{"role": "user", "content": [{"type": "image", "image": "1.png"}, {"type": "text", "text": "What is the difference in value between Lamb and Corn?"}]}, {"role": "assistant", "content": [{"type": "text", "text": "0.57"}]}]}
\ No newline at end of file

From 46837de68e8706b0d6a48386f3d4147df2d4178b Mon Sep 17 00:00:00 2001
From: root <root@TENCENT64.site>
Date: Tue, 13 Jan 2026 21:03:25 +0800
Subject: [PATCH 2/4] code refactoring

---
 .../compressor/quant/modules/__init__.py      |   1 +
 .../compressor/quant/modules/helper_layer.py  |  77 +++++++++
 .../compressor/quant/observers/__init__.py    |   1 +
 .../quant/observers/abs_max_activation.py     |  89 ++++++++++
 angelslim/compressor/quant/ptq.py             | 135 +--------------
 angelslim/models/base_model.py                |   3 +
 angelslim/models/vlm/qwen3_vl_moe.py          | 156 +++++++-----------
 7 files changed, 238 insertions(+), 224 deletions(-)

diff --git a/angelslim/compressor/quant/modules/__init__.py b/angelslim/compressor/quant/modules/__init__.py
index 49b41bb2..ac23d34a 100644
--- a/angelslim/compressor/quant/modules/__init__.py
+++ b/angelslim/compressor/quant/modules/__init__.py
@@ -19,6 +19,7 @@
 from .gptq.gptq import GPTQ  # noqa: F401
 from .gptq.gptq_module import GPTQModule  # noqa: F401
 from .helper_layer import GPTQQuantLinear  # noqa: F401
+from .helper_layer import MoEQDQModule  # noqa: F401
 from .helper_layer import NVFP4QDQModule  # noqa: F401
 from .helper_layer import QDQModule  # noqa: F401
 from .helper_layer import QDQSingleModule  # noqa: F401
diff --git a/angelslim/compressor/quant/modules/helper_layer.py b/angelslim/compressor/quant/modules/helper_layer.py
index bb8284d0..c9d6bb4f 100644
--- a/angelslim/compressor/quant/modules/helper_layer.py
+++ b/angelslim/compressor/quant/modules/helper_layer.py
@@ -1023,3 +1023,80 @@ def _unpack_tensor(input: torch.Tensor):
             deq_data.shape[0], deq_data.shape[1] // block_size, -1
         ) * per_block_scale.unsqueeze(-1)
         return deq_data.view(-1)[: np.prod(self.shape)].reshape(self.shape).to(dtype)
+
+
+class MoEQDQModule(torch.nn.Module):
+    def __init__(
+        self,
+        gate_proj: torch.nn.Parameter,
+        up_proj: torch.nn.Parameter,
+        down_proj: torch.nn.Parameter,
+        gate_proj_weight_scale: torch.nn.Parameter,
+        up_proj_weight_scale: torch.nn.Parameter,
+        down_proj_weight_scale: torch.nn.Parameter,
+        gate_up_proj_input_scale: torch.nn.Parameter,
+        down_proj_input_scale: torch.nn.Parameter,
+    ):
+        super().__init__()
+        quant_gate_weight, _ = quantize_weight_per_tensor_fp8(
+            gate_proj, gate_proj_weight_scale
+        )
+        quant_up_weight, _ = quantize_weight_per_tensor_fp8(
+            up_proj, up_proj_weight_scale
+        )
+        quant_down_weight, _ = quantize_weight_per_tensor_fp8(
+            down_proj, down_proj_weight_scale
+        )
+        quant_gate_up_weight = torch.cat([quant_gate_weight, quant_up_weight], dim=-1)
+
+        self.gate_up_proj = torch.nn.Parameter(
+            quant_gate_up_weight, requires_grad=False
+        )
+        self.down_proj = torch.nn.Parameter(quant_down_weight, requires_grad=False)
+
+        gate_proj_weight_scale = (
+            gate_proj_weight_scale.view(-1)
+            if gate_proj_weight_scale.ndim == 0
+            else gate_proj_weight_scale
+        )
+        up_proj_weight_scale = (
+            up_proj_weight_scale.view(-1)
+            if up_proj_weight_scale.ndim == 0
+            else up_proj_weight_scale
+        )
+        down_proj_weight_scale = (
+            down_proj_weight_scale.view(-1)
+            if down_proj_weight_scale.ndim == 0
+            else down_proj_weight_scale
+        )
+        gate_up_proj_weight_scale = torch.cat(
+            [gate_proj_weight_scale, up_proj_weight_scale], dim=-1
+        )
+
+        self.gate_up_proj_weight_scale = torch.nn.Parameter(
+            gate_up_proj_weight_scale, requires_grad=False
+        )
+        self.down_proj_weight_scale = torch.nn.Parameter(
+            down_proj_weight_scale, requires_grad=False
+        )
+
+        down_proj_input_scale = (
+            down_proj_input_scale.view(-1)
+            if down_proj_input_scale.ndim == 0
+            else down_proj_input_scale.squeeze()
+        )
+        gate_up_proj_input_scale = (
+            gate_up_proj_input_scale.view(-1)
+            if gate_up_proj_input_scale.ndim == 0
+            else gate_up_proj_input_scale.squeeze()
+        )
+
+        self.gate_up_proj_input_scale = torch.nn.Parameter(
+            gate_up_proj_input_scale, requires_grad=False
+        )
+        self.down_proj_input_scale = torch.nn.Parameter(
+            down_proj_input_scale, requires_grad=False
+        )
+
+    def forward(self, x):
+        raise NotImplementedError("MoEQDQModule.forward is not implemented.")
diff --git a/angelslim/compressor/quant/observers/__init__.py b/angelslim/compressor/quant/observers/__init__.py
index 6287a896..bb021db2 100644
--- a/angelslim/compressor/quant/observers/__init__.py
+++ b/angelslim/compressor/quant/observers/__init__.py
@@ -15,6 +15,7 @@
 from .abs_max_activation import AbsmaxPerchannelObserver  # noqa: F401
 from .abs_max_activation import AbsmaxPertensorObserver  # noqa: F401
 from .abs_max_activation import AbsMaxTokenWiseActObserver  # noqa: F401; noqa: F401
+from .abs_max_activation import MoEAbsmaxPertensorObserver  # noqa: F401
 from .abs_max_weight import AbsMaxChannelWiseWeightObserver  # noqa: F401
 from .base_observer import BaseObserver, ParentObserver  # noqa: F401
 from .ema_activation import EMAObserver  # noqa: F401
diff --git a/angelslim/compressor/quant/observers/abs_max_activation.py b/angelslim/compressor/quant/observers/abs_max_activation.py
index 7d390540..51afd2f5 100644
--- a/angelslim/compressor/quant/observers/abs_max_activation.py
+++ b/angelslim/compressor/quant/observers/abs_max_activation.py
@@ -20,6 +20,7 @@
     "AbsmaxPertensorObserver",
     "AbsMaxTokenWiseActObserver",
     "AbsmaxPerchannelObserver",
+    "MoEAbsmaxPertensorObserver",
 ]
 
 
@@ -217,3 +218,91 @@ def zero_points(self):
         if self._zero_point is None:
             self.cal_thresholds()
         return self._zero_point
+
+
+class MoEAbsmaxPertensorObserver(BaseObserver):
+    def __init__(self, layer_name=None, quant_bits=8, **kwargs):
+        super(MoEAbsmaxPertensorObserver, self).__init__(quant_bits=quant_bits)
+        self.layer_name = layer_name
+        self._scale = None
+        self._zero_point = None
+        self._min = None
+        self._max = torch.tensor(1e-7, dtype=torch.float32)
+        self.step = 0
+        self.dtype = None
+        self.parent_observer = (
+            kwargs["parent_observer"]
+            if kwargs and "parent_observer" in kwargs
+            else None
+        )
+
+    def forward(self, inputs):
+        """Calculate forward pass."""
+        self.step += 1
+        if not self.dtype:
+            self.dtype = inputs.dtype
+        if inputs.numel() > 0:
+            self._min, self._max = self._cal_min_max(inputs)
+            if self.parent_observer is not None:
+                self.parent_observer.update(self._min, self._max, self.step)
+        else:
+            assert self.parent_observer is not None
+            self._update_min_max(self.parent_observer.min, self.parent_observer.max)
+        return inputs
+
+    def _cal_min_max(self, inputs):
+        if inputs.dim() >= 2:
+            abs_inputs = torch.abs(inputs)
+            batch_size = abs_inputs.shape[0]
+            abs_inputs_flat = abs_inputs.view(
+                batch_size, -1
+            )  # [batch_size, seq_len * hidden_dim]
+            abs_max_val, _ = torch.max(
+                abs_inputs_flat, dim=1, keepdim=True
+            )  # [batch_size, 1]
+            min_threshold = self._max.to(abs_max_val.device).expand_as(abs_max_val)
+            abs_max_val = torch.maximum(abs_max_val, min_threshold)
+        else:
+            abs_max_val = torch.max(torch.abs(inputs))
+            if abs_max_val.data < self._max.data:
+                abs_max_val = self._max
+            abs_max_val = abs_max_val.unsqueeze(0).unsqueeze(0)  # [1, 1]
+        return 0, abs_max_val.to(inputs.device)
+
+    def _update_min_max(self, min, max):
+        if min is not None and max is not None:
+            if self._min is None or min < self._min:
+                self._min = min
+            if self._max is None or max > self._max:
+                self._max = max
+
+    def cal_thresholds(self):
+        """Compute thresholds for MAX function."""
+        if self._scale is None:
+            self._scale = self._max
+        self._zero_point = torch.zeros_like(self._scale)
+
+    def quant_axis(self):
+        """Return quantization axis."""
+        return -1
+
+    def scales(self):
+        """Return the abs-max scale, cast to the dtype seen during calibration."""
+        if self.step == 0 and self.parent_observer is not None:
+            self._update_min_max(self.parent_observer.min, self.parent_observer.max)
+            self.step = self.parent_observer.step
+        if self.step == 0:
+            raise ValueError(
+                "MoEAbsmaxPertensorObserver scales must calibrate data first!"
+            )
+        if self._scale is None:
+            self.cal_thresholds()
+        if self.dtype:
+            self._scale = self._scale.type(self.dtype)
+        return self._scale
+
+    def zero_points(self):
+        """Return output zero points."""
+        if self._zero_point is None:
+            self.cal_thresholds()
+        return self._zero_point
diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py
index 39252b7f..244e98f3 100644
--- a/angelslim/compressor/quant/ptq.py
+++ b/angelslim/compressor/quant/ptq.py
@@ -18,12 +18,6 @@
 
 import torch
 from safetensors.torch import load_file
-from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
-
-from angelslim.compressor.quant.core.quant_func import (
-    get_fp_maxval,
-    quantize_weight_per_tensor_fp8,
-)
 
 from ...utils import find_parent_layer_and_sub_name, print_info
 from ..compressor_factory import CompressorFactory
@@ -291,131 +285,16 @@ def _convert(self):
             if qdq_module is not sub_layer:
                 setattr(parent_layer, sub_name, qdq_module)
 
-        maxval = get_fp_maxval(bits=8)
+        # 3. insert moe qdq module
         for name, sub_layer in self.quant_model.model.named_modules():
-            if isinstance(sub_layer, Qwen3VLMoeTextExperts):
-                parent_layer, sub_name = find_parent_layer_and_sub_name(
-                    quant_convert_module, name
-                )
-                gate_up_act_max = sub_layer.gateupobservers.scales()
-                down_act_max = sub_layer.downobservers.scales()
-                gate_up_act_dtype = gate_up_act_max.dtype
-                down_act_dtype = down_act_max.dtype
-                gate_up_act_scale = gate_up_act_max / maxval.type(gate_up_act_dtype)
-                down_act_scale = down_act_max / maxval.type(down_act_dtype)
-
-                gate_proj, up_proj = sub_layer.gate_up_proj.chunk(2, dim=-1)
-                abs_inputs = torch.abs(gate_proj)
-                batch_size = abs_inputs.shape[0]
-                abs_inputs_flat = abs_inputs.view(batch_size, -1)
-                gate_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
-
-                abs_inputs = torch.abs(up_proj)
-                batch_size = abs_inputs.shape[0]
-                abs_inputs_flat = abs_inputs.view(batch_size, -1)
-                up_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
-
-                abs_inputs = torch.abs(sub_layer.down_proj)
-                batch_size = abs_inputs.shape[0]
-                abs_inputs_flat = abs_inputs.view(batch_size, -1)
-                down_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
-
-                gate_weight_dtype = gate_proj.dtype
-                up_weight_dtype = up_proj.dtype
-                down_weight_dtype = sub_layer.down_proj.dtype
-                gate_weight_scale = gate_weight_max / maxval.type(gate_weight_dtype)
-                up_weight_scale = up_weight_max / maxval.type(up_weight_dtype)
-                down_weight_scale = down_weight_max / maxval.type(down_weight_dtype)
+            parent_layer, sub_name = find_parent_layer_and_sub_name(
+                quant_convert_module, name
+            )
+            moe_qdq_module = self.quant_model.get_moe_qdq_module(sub_layer, name)
+            if moe_qdq_module is not sub_layer:
+                setattr(parent_layer, sub_name, moe_qdq_module)
 
-                q_linear = MyQDQModule(
-                    gate_proj=gate_proj.cpu(),
-                    up_proj=up_proj.cpu(),
-                    down_proj=sub_layer.down_proj.cpu(),
-                    gate_proj_weight_scale=gate_weight_scale.cpu(),
-                    up_proj_weight_scale=up_weight_scale.cpu(),
-                    down_proj_weight_scale=down_weight_scale.cpu(),
-                    gate_up_proj_input_scale=gate_up_act_scale.cpu(),
-                    down_proj_input_scale=down_act_scale.cpu(),
-                )
-                setattr(parent_layer, sub_name, q_linear)
         self.quant_model.quantized = True
 
     def __getattr__(self, item):
         return super().__getattr__(item)
-
-
-class MyQDQModule(torch.nn.Module):
-    def __init__(
-        self,
-        gate_proj: torch.nn.Parameter,
-        up_proj: torch.nn.Parameter,
-        down_proj: torch.nn.Parameter,
-        gate_proj_weight_scale: torch.nn.Parameter,
-        up_proj_weight_scale: torch.nn.Parameter,
-        down_proj_weight_scale: torch.nn.Parameter,
-        gate_up_proj_input_scale: torch.nn.Parameter,
-        down_proj_input_scale: torch.nn.Parameter,
-    ):
-        super().__init__()
-        quant_gate_weight, _ = quantize_weight_per_tensor_fp8(
-            gate_proj, gate_proj_weight_scale
-        )
-        quant_up_weight, _ = quantize_weight_per_tensor_fp8(
-            up_proj, up_proj_weight_scale
-        )
-        quant_down_weight, _ = quantize_weight_per_tensor_fp8(
-            down_proj, down_proj_weight_scale
-        )
-        quant_gate_up_weight = torch.cat([quant_gate_weight, quant_up_weight], dim=-1)
-
-        self.gate_up_proj = torch.nn.Parameter(
-            quant_gate_up_weight, requires_grad=False
-        )
-        self.down_proj = torch.nn.Parameter(quant_down_weight, requires_grad=False)
-
-        gate_proj_weight_scale = (
-            gate_proj_weight_scale.view(-1)
-            if gate_proj_weight_scale.ndim == 0
-            else gate_proj_weight_scale
-        )
-        up_proj_weight_scale = (
-            up_proj_weight_scale.view(-1)
-            if up_proj_weight_scale.ndim == 0
-            else up_proj_weight_scale
-        )
-        down_proj_weight_scale = (
-            down_proj_weight_scale.view(-1)
-            if down_proj_weight_scale.ndim == 0
-            else down_proj_weight_scale
-        )
-        gate_up_proj_weight_scale = torch.cat(
-            [gate_proj_weight_scale, up_proj_weight_scale], dim=-1
-        )
-
-        self.gate_up_proj_weight_scale = torch.nn.Parameter(
-            gate_up_proj_weight_scale, requires_grad=False
-        )
-        self.down_proj_weight_scale = torch.nn.Parameter(
-            down_proj_weight_scale, requires_grad=False
-        )
-
-        down_proj_input_scale = (
-            down_proj_input_scale.view(-1)
-            if down_proj_input_scale.ndim == 0
-            else down_proj_input_scale.squeeze()
-        )
-        gate_up_proj_input_scale = (
-            gate_up_proj_input_scale.view(-1)
-            if gate_up_proj_input_scale.ndim == 0
-            else gate_up_proj_input_scale.squeeze()
-        )
-
-        self.gate_up_proj_input_scale = torch.nn.Parameter(
-            gate_up_proj_input_scale, requires_grad=False
-        )
-        self.down_proj_input_scale = torch.nn.Parameter(
-            down_proj_input_scale, requires_grad=False
-        )
-
-    def forward(self, x):
-        pass
diff --git a/angelslim/models/base_model.py b/angelslim/models/base_model.py
index 17e9316c..99e8c398 100644
--- a/angelslim/models/base_model.py
+++ b/angelslim/models/base_model.py
@@ -147,6 +147,9 @@ def get_qdq_module(self, sub_layer, name):
             raise NotImplementedError
         return q_linear
 
+    def get_moe_qdq_module(self, sub_layer, name):
+        return sub_layer
+
     def get_nvfp4_qdq_module(self, sub_layer, name):
         act_scale, weight_scale, weight_scale_2 = None, None, None
         block_size = self.quant_config.quant_algo_info["block_size"]
diff --git a/angelslim/models/vlm/qwen3_vl_moe.py b/angelslim/models/vlm/qwen3_vl_moe.py
index 5c28f532..c4302e45 100644
--- a/angelslim/models/vlm/qwen3_vl_moe.py
+++ b/angelslim/models/vlm/qwen3_vl_moe.py
@@ -25,15 +25,17 @@
 )
 from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
 
-from angelslim.compressor.quant.observers.base_observer import BaseObserver
+from angelslim.compressor.quant.core.quant_func import get_fp_maxval
+from angelslim.compressor.quant.observers import MoEAbsmaxPertensorObserver
 
 from ...compressor.quant.core import LossFilter, PTQVLMSaveVllmHF
+from ...compressor.quant.modules import MoEQDQModule
 from ...utils import find_layers, print_info
 from ..base_model import BaseLLMModel
 from ..model_factory import SlimModelFactory
 
 
-def observer_forward(
+def moe_observer_forward(
     self,
     hidden_states: torch.Tensor,
     routing_weights: torch.Tensor,
@@ -108,9 +110,6 @@ def observer_forward(
     return next_states
 
 
-Qwen3VLMoeTextExperts.forward = observer_forward
-
-
 @SlimModelFactory.register
 class Qwen3VLMoE(BaseLLMModel):
     def __init__(
@@ -142,7 +141,6 @@ def from_pretrained(
         low_cpu_mem_usage=True,
         use_cache=False,
         using_multi_nodes=False,
-        compress_config=None,
     ):
         self.model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
             model_path,
@@ -162,6 +160,14 @@ def from_pretrained(
             model_path, trust_remote_code=trust_remote_code
         )
 
+    def init_ptq(self, slim_config):
+        for _, module in self.model.named_modules():
+            if isinstance(module, Qwen3VLMoeTextExperts):
+                module.forward = moe_observer_forward.__get__(
+                    module, Qwen3VLMoeTextExperts
+                )
+        super().init_ptq(slim_config)
+
     def get_observer_layers(self):
         names = [
             "self_attn.k_proj",
@@ -189,12 +195,12 @@ def get_observer_layers(self):
             if result == "mlp.experts":
                 if not hasattr(module, "gateupobservers"):
                     layername = name + ".gate_up"
-                    module.gateupobservers = MyAbsmaxPertensorObserver(
+                    module.gateupobservers = MoEAbsmaxPertensorObserver(
                         layer_name=layername
                     )
                 if not hasattr(module, "downobservers"):
                     layername = name + ".down"
-                    module.downobservers = MyAbsmaxPertensorObserver(
+                    module.downobservers = MoEAbsmaxPertensorObserver(
                         layer_name=layername
                     )
             else:
@@ -210,6 +216,52 @@ def get_observer_layers(self):
                         observer_layers_dict.pop(default_name)
         return observer_layers_dict
 
+    def get_moe_qdq_module(self, sub_layer, name):
+        if not isinstance(sub_layer, Qwen3VLMoeTextExperts):
+            return sub_layer
+        maxval = get_fp_maxval(bits=8)
+        gate_up_act_max = sub_layer.gateupobservers.scales()
+        down_act_max = sub_layer.downobservers.scales()
+        gate_up_act_dtype = gate_up_act_max.dtype
+        down_act_dtype = down_act_max.dtype
+        gate_up_act_scale = gate_up_act_max / maxval.type(gate_up_act_dtype)
+        down_act_scale = down_act_max / maxval.type(down_act_dtype)
+
+        gate_proj, up_proj = sub_layer.gate_up_proj.chunk(2, dim=-1)
+        abs_inputs = torch.abs(gate_proj)
+        batch_size = abs_inputs.shape[0]
+        abs_inputs_flat = abs_inputs.view(batch_size, -1)
+        gate_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
+
+        abs_inputs = torch.abs(up_proj)
+        batch_size = abs_inputs.shape[0]
+        abs_inputs_flat = abs_inputs.view(batch_size, -1)
+        up_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
+
+        abs_inputs = torch.abs(sub_layer.down_proj)
+        batch_size = abs_inputs.shape[0]
+        abs_inputs_flat = abs_inputs.view(batch_size, -1)
+        down_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True)
+
+        gate_weight_dtype = gate_proj.dtype
+        up_weight_dtype = up_proj.dtype
+        down_weight_dtype = sub_layer.down_proj.dtype
+        gate_weight_scale = gate_weight_max / maxval.type(gate_weight_dtype)
+        up_weight_scale = up_weight_max / maxval.type(up_weight_dtype)
+        down_weight_scale = down_weight_max / maxval.type(down_weight_dtype)
+
+        q_linear = MoEQDQModule(
+            gate_proj=gate_proj.cpu(),
+            up_proj=up_proj.cpu(),
+            down_proj=sub_layer.down_proj.cpu(),
+            gate_proj_weight_scale=gate_weight_scale.cpu(),
+            up_proj_weight_scale=up_weight_scale.cpu(),
+            down_proj_weight_scale=down_weight_scale.cpu(),
+            gate_up_proj_input_scale=gate_up_act_scale.cpu(),
+            down_proj_input_scale=down_act_scale.cpu(),
+        )
+        return q_linear
+
     def model_forward(self, dataloader, **kwargs):
         self.model.use_cache = False
 
@@ -297,91 +349,3 @@ def get_save_func(self):
             raise NotImplementedError(
                 f"deploy_backend {self.deploy_backend} is not supported for saving."
             )
-
-
-class MyAbsmaxPertensorObserver(BaseObserver):
-    def __init__(self, layer_name=None, quant_bits=8, **kwargs):
-        super(MyAbsmaxPertensorObserver, self).__init__(quant_bits=quant_bits)
-        self.layer_name = layer_name
-        self._scale = None
-        self._zero_point = None
-        self._min = None
-        self._max = torch.tensor(1e-7, dtype=torch.float32)
-        self.step = 0
-        self.dtype = None
-        self.parent_observer = (
-            kwargs["parent_observer"]
-            if kwargs and "parent_observer" in kwargs
-            else None
-        )
-
-    def forward(self, inputs):
-        """Calculate forward pass."""
-        self.step += 1
-        if not self.dtype:
-            self.dtype = inputs.dtype
-        if inputs.numel() > 0:
-            self._min, self._max = self._cal_min_max(inputs)
-            if self.parent_observer is not None:
-                self.parent_observer.update(self._min, self._max, self.step)
-        else:
-            assert self.parent_observer is not None
-            self._update_min_max(self.parent_observer.min, self.parent_observer.max)
-        return inputs
-
-    def _cal_min_max(self, inputs):
-        if inputs.dim() >= 2:
-            abs_inputs = torch.abs(inputs)
-            batch_size = abs_inputs.shape[0]
-            abs_inputs_flat = abs_inputs.view(
-                batch_size, -1
-            )  # [batch_size, seq_len * hidden_dim]
-            abs_max_val, _ = torch.max(
-                abs_inputs_flat, dim=1, keepdim=True
-            )  # [batch_size, 1]
-            min_threshold = self._max.to(abs_max_val.device).expand_as(abs_max_val)
-            abs_max_val = torch.maximum(abs_max_val, min_threshold)
-        else:
-            abs_max_val = torch.max(torch.abs(inputs))
-            if abs_max_val.data < self._max.data:
-                abs_max_val = self._max
-            abs_max_val = abs_max_val.unsqueeze(0).unsqueeze(0)  # [1, 1]
-        return 0, abs_max_val.to(inputs.device)
-
-    def _update_min_max(self, min, max):
-        if min is not None and max is not None:
-            if self._min is None or min < self._min:
-                self._min = min
-            if self._max is None or max > self._max:
-                self._max = max
-
-    def cal_thresholds(self):
-        """Compute thresholds for MAX function."""
-        if self._scale is None:
-            self._scale = self._max
-        self._zero_point = torch.zeros_like(self._scale)
-
-    def quant_axis(self):
-        """Return quantization axis."""
-        return -1
-
-    def scales(self):
-        """Return output scales."""
-        if self.step == 0 and self.parent_observer is not None:
-            self._update_min_max(self.parent_observer.min, self.parent_observer.max)
-            self.step = self.parent_observer.step
-        if self.step == 0:
-            raise ValueError(
-                "AbsmaxPertensorObserver scales must calibrate data first!"
-            )
-        if self._scale is None:
-            self.cal_thresholds()
-        if self.dtype:
-            self._scale = self._scale.type(self.dtype)
-        return self._scale
-
-    def zero_points(self):
-        """Return output zero points."""
-        if self._zero_point is None:
-            self.cal_thresholds()
-        return self._zero_point

From fd9430fac58ef76f8ce6f069a6881afe45a557db Mon Sep 17 00:00:00 2001
From: root <root@TENCENT64.site>
Date: Tue, 13 Jan 2026 21:09:40 +0800
Subject: [PATCH 3/4] fix fp8 quant yaml name

---
 ...35b_a3b_fp8_static.yaml => qwen3_vl-235b_a22b_fp8_static.yaml} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename configs/qwen3_vl/fp8_static/{qwen3_vl-235b_a3b_fp8_static.yaml => qwen3_vl-235b_a22b_fp8_static.yaml} (100%)

diff --git a/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml b/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a22b_fp8_static.yaml
similarity index 100%
rename from configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml
rename to configs/qwen3_vl/fp8_static/qwen3_vl-235b_a22b_fp8_static.yaml

From 8fa9b7b4345f6b74905da300e80a5a201ff522a8 Mon Sep 17 00:00:00 2001
From: root <root@TENCENT64.site>
Date: Tue, 13 Jan 2026 21:41:06 +0800
Subject: [PATCH 4/4] update insert moe qdq module

---
 angelslim/compressor/quant/ptq.py    | 19 ++++++++++++-------
 angelslim/models/vlm/qwen3_vl_moe.py |  2 +-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py
index 244e98f3..281c4bfe 100644
--- a/angelslim/compressor/quant/ptq.py
+++ b/angelslim/compressor/quant/ptq.py
@@ -18,6 +18,7 @@
 
 import torch
 from safetensors.torch import load_file
+from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts
 
 from ...utils import find_parent_layer_and_sub_name, print_info
 from ..compressor_factory import CompressorFactory
@@ -286,13 +287,17 @@ def _convert(self):
                 setattr(parent_layer, sub_name, qdq_module)
 
         # 3. insert moe qdq module
-        for name, sub_layer in self.quant_model.model.named_modules():
-            parent_layer, sub_name = find_parent_layer_and_sub_name(
-                quant_convert_module, name
-            )
-            moe_qdq_module = self.quant_model.get_moe_qdq_module(sub_layer, name)
-            if moe_qdq_module is not sub_layer:
-                setattr(parent_layer, sub_name, moe_qdq_module)
+        # For qwen3_vl_moe models, we need to insert MoEQDQModule for MOE experts,
+        # since these modules contain gate_up_proj and down_proj, which are defined as
+        # nn.Parameters, not nn.Linear.
+        if Qwen3VLMoeTextExperts in self.quant_model.observer_layer_classes:
+            for name, sub_layer in self.quant_model.model.named_modules():
+                parent_layer, sub_name = find_parent_layer_and_sub_name(
+                    quant_convert_module, name
+                )
+                moe_qdq_module = self.quant_model.get_moe_qdq_module(sub_layer, name)
+                if moe_qdq_module is not sub_layer:
+                    setattr(parent_layer, sub_name, moe_qdq_module)
 
         self.quant_model.quantized = True
 
diff --git a/angelslim/models/vlm/qwen3_vl_moe.py b/angelslim/models/vlm/qwen3_vl_moe.py
index c4302e45..470e3430 100644
--- a/angelslim/models/vlm/qwen3_vl_moe.py
+++ b/angelslim/models/vlm/qwen3_vl_moe.py
@@ -56,7 +56,7 @@ def moe_observer_forward(
     Returns:
         torch.Tensor
     """
-    # replace Qwen3VLMoeTextExperts forward function by observer_forward"
+    # This function replaces Qwen3VLMoeTextExperts.forward (bound in init_ptq).
     batch_size = hidden_states.shape[0]
     hidden_states = hidden_states.reshape(
         -1, self.hidden_size