From e3e2b2c2ea0205c9ae4966f688ef82c186d56c87 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 13 Jan 2026 19:10:46 +0800 Subject: [PATCH 1/4] feature: support Qwen3_VL_Moe static quantization --- angelslim/compressor/quant/core/quant_func.py | 7 + angelslim/compressor/quant/ptq.py | 131 ++++++ angelslim/data/multimodal_dataset.py | 95 ++++- angelslim/models/vlm/__init__.py | 1 + angelslim/models/vlm/qwen3_vl_moe.py | 387 ++++++++++++++++++ angelslim/utils/config_parser.py | 2 +- .../qwen3_vl-235b_a3b_fp8_static.yaml | 36 ++ .../qwen3_vl-30b_a3b_fp8_static.yaml | 36 ++ .../fake_data_openai_formate.json | 2 + 9 files changed, 676 insertions(+), 21 deletions(-) create mode 100644 angelslim/models/vlm/qwen3_vl_moe.py create mode 100644 configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml create mode 100644 configs/qwen3_vl/fp8_static/qwen3_vl-30b_a3b_fp8_static.yaml create mode 100644 dataset/multimodal_fake_data/fake_data_openai_formate.json diff --git a/angelslim/compressor/quant/core/quant_func.py b/angelslim/compressor/quant/core/quant_func.py index bb4ea8d3..18280189 100644 --- a/angelslim/compressor/quant/core/quant_func.py +++ b/angelslim/compressor/quant/core/quant_func.py @@ -73,10 +73,17 @@ def quantize_weight_per_tensor_fp8( ) -> Tuple[torch.Tensor, float]: finfo = torch.finfo(torch.float8_e4m3fn) + squeeze_dim = False + while scale.ndim < tensor.ndim: + scale = scale.unsqueeze(-1) + squeeze_dim = True + qweight = (tensor / scale).clamp(min=finfo.min, max=finfo.max) # Return both float8 data and the inverse scale (as float), # as both required as inputs to torch._scaled_mm qweight = qweight.to(torch.float8_e4m3fn) + if squeeze_dim: + scale = scale.squeeze(-1) scale = scale.float() return qweight, scale diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index 8aabb1f2..39252b7f 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -18,6 +18,12 @@ import torch from safetensors.torch import load_file +from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts + +from angelslim.compressor.quant.core.quant_func import ( + get_fp_maxval, + quantize_weight_per_tensor_fp8, +) from ...utils import find_parent_layer_and_sub_name, print_info from ..compressor_factory import CompressorFactory @@ -284,7 +290,132 @@ def _convert(self): if qdq_module is not sub_layer: setattr(parent_layer, sub_name, qdq_module) + + maxval = get_fp_maxval(bits=8) + for name, sub_layer in self.quant_model.model.named_modules(): + if isinstance(sub_layer, Qwen3VLMoeTextExperts): + parent_layer, sub_name = find_parent_layer_and_sub_name( + quant_convert_module, name + ) + gate_up_act_max = sub_layer.gateupobservers.scales() + down_act_max = sub_layer.downobservers.scales() + gate_up_act_dtype = gate_up_act_max.dtype + down_act_dtype = down_act_max.dtype + gate_up_act_scale = gate_up_act_max / maxval.type(gate_up_act_dtype) + down_act_scale = down_act_max / maxval.type(down_act_dtype) + + gate_proj, up_proj = sub_layer.gate_up_proj.chunk(2, dim=-1) + abs_inputs = torch.abs(gate_proj) + batch_size = abs_inputs.shape[0] + abs_inputs_flat = abs_inputs.view(batch_size, -1) + gate_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) + + abs_inputs = torch.abs(up_proj) + batch_size = abs_inputs.shape[0] + abs_inputs_flat = abs_inputs.view(batch_size, -1) + up_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) + + abs_inputs = torch.abs(sub_layer.down_proj) + batch_size = abs_inputs.shape[0] + abs_inputs_flat = abs_inputs.view(batch_size, -1) + down_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) + + gate_weight_dtype = gate_proj.dtype + up_weight_dtype = up_proj.dtype + down_weight_dtype = sub_layer.down_proj.dtype + gate_weight_scale = gate_weight_max / maxval.type(gate_weight_dtype) + up_weight_scale = up_weight_max / maxval.type(up_weight_dtype) + down_weight_scale = down_weight_max / maxval.type(down_weight_dtype) + + q_linear = MyQDQModule( + gate_proj=gate_proj.cpu(), + up_proj=up_proj.cpu(), + down_proj=sub_layer.down_proj.cpu(), + gate_proj_weight_scale=gate_weight_scale.cpu(), + up_proj_weight_scale=up_weight_scale.cpu(), + down_proj_weight_scale=down_weight_scale.cpu(), + gate_up_proj_input_scale=gate_up_act_scale.cpu(), + down_proj_input_scale=down_act_scale.cpu(), + ) + setattr(parent_layer, sub_name, q_linear) self.quant_model.quantized = True def __getattr__(self, item): return super().__getattr__(item) + + +class MyQDQModule(torch.nn.Module): + def __init__( + self, + gate_proj: torch.nn.Parameter, + up_proj: torch.nn.Parameter, + down_proj: torch.nn.Parameter, + gate_proj_weight_scale: torch.nn.Parameter, + up_proj_weight_scale: torch.nn.Parameter, + down_proj_weight_scale: torch.nn.Parameter, + gate_up_proj_input_scale: torch.nn.Parameter, + down_proj_input_scale: torch.nn.Parameter, + ): + super().__init__() + quant_gate_weight, _ = quantize_weight_per_tensor_fp8( + gate_proj, gate_proj_weight_scale + ) + quant_up_weight, _ = quantize_weight_per_tensor_fp8( + up_proj, up_proj_weight_scale + ) + quant_down_weight, _ = quantize_weight_per_tensor_fp8( + down_proj, down_proj_weight_scale + ) + quant_gate_up_weight = torch.cat([quant_gate_weight, quant_up_weight], dim=-1) + + self.gate_up_proj = torch.nn.Parameter( + quant_gate_up_weight, requires_grad=False + ) + self.down_proj = torch.nn.Parameter(quant_down_weight, requires_grad=False) + + gate_proj_weight_scale = ( + gate_proj_weight_scale.view(-1) + if gate_proj_weight_scale.ndim == 0 + else gate_proj_weight_scale + ) + up_proj_weight_scale = ( + up_proj_weight_scale.view(-1) + if up_proj_weight_scale.ndim == 0 + else up_proj_weight_scale + ) + down_proj_weight_scale = ( + down_proj_weight_scale.view(-1) + if down_proj_weight_scale.ndim == 0 + else down_proj_weight_scale + ) + gate_up_proj_weight_scale = torch.cat( + [gate_proj_weight_scale, up_proj_weight_scale], dim=-1 + ) + + self.gate_up_proj_weight_scale = torch.nn.Parameter( + gate_up_proj_weight_scale, requires_grad=False + ) + self.down_proj_weight_scale = torch.nn.Parameter( + down_proj_weight_scale, requires_grad=False + ) + + down_proj_input_scale = ( + down_proj_input_scale.view(-1) + if down_proj_input_scale.ndim == 0 + else down_proj_input_scale.squeeze() + ) + gate_up_proj_input_scale = ( + gate_up_proj_input_scale.view(-1) + if gate_up_proj_input_scale.ndim == 0 + else gate_up_proj_input_scale.squeeze() + ) + + self.gate_up_proj_input_scale = torch.nn.Parameter( + gate_up_proj_input_scale, requires_grad=False + ) + self.down_proj_input_scale = torch.nn.Parameter( + down_proj_input_scale, requires_grad=False + ) + + def forward(self, x): + pass diff --git a/angelslim/data/multimodal_dataset.py b/angelslim/data/multimodal_dataset.py index a51a6ccb..c6e80ea5 100644 --- a/angelslim/data/multimodal_dataset.py +++ b/angelslim/data/multimodal_dataset.py @@ -49,7 +49,7 @@ def __init__( def _load_file_based_dataset(self, data_path: str, num_samples: int): """Load dataset from local file system""" - image_dir = os.path.join(os.path.dirname(data_path), "images") + self.data_path = data_path line_count = 0 with open(data_path, "r") as f: @@ -58,29 +58,84 @@ def _load_file_based_dataset(self, data_path: str, num_samples: int): break data = json.loads(line.strip()) - image_path = os.path.join(image_dir, data["img_path"]) - # Prepare chat messages with image - messages = [ - { - "role": "user", - "content": [ - {"type": "image", "image": image_path}, - { - "type": "text", - "text": data["question"].replace("", ""), - }, - ], - }, - { - "role": "assistant", - "content": [{"type": "text", "text": data["answer"]}], - }, - ] + # Validate format + assert "messages" in data or "question" in data, "JSON format error" + + # Prepare messages + messages = self._prepare_messages(data) self._process_and_append(messages) line_count += 1 + def _prepare_messages(self, data: Dict) -> List[Dict]: + image_dir = os.path.join(os.path.dirname(self.data_path), "images") + if "question" in data: + # Prepare chat messages with image + messages = [] + if "system_prompt" in data: + messages.extend( + [ + { + "role": "system", + "content": [ + {"type": "text", "text": data["system_prompt"]} + ], + } + ] + ) + if "img_path" in data: + image_path = os.path.join(image_dir, data["img_path"]) + messages.extend( + [ + { + "role": "user", + "content": [ + {"type": "image", "image": image_path}, + { + "type": "text", + "text": data["question"].replace("", ""), + }, + ], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": data["answer"]}], + }, + ] + ) + else: + messages.extend( + [ + { + "role": "user", + "content": [ + {"type": "text", "text": data["question"]}, + ], + }, + { + "role": "assistant", + "content": [{"type": "text", "text": data["answer"]}], + }, + ] + ) + elif "messages" in data: + messages = data["messages"] + for message in messages: + if message["role"] == "user": + for content in message["content"]: + if content["type"] == "image": + content["image"] = os.path.join(image_dir, content["image"]) + else: + raise ValueError("Invalid data format") + + # adapt to hunyuan_vl + if self.model_name in ["HunyuanVL"]: + for message in messages: + if message["role"] == "assistant" or message["role"] == "system": + message["content"] = message["content"][0]["text"] + return messages + def _load_hf_dataset(self, dataset: str, num_samples: int): """Load dataset from Hugging Face format""" dataset = load_dataset(dataset, split="test") @@ -108,7 +163,7 @@ def _load_hf_dataset(self, dataset: str, num_samples: int): def _process_and_append(self, messages: List[Dict]): """Process messages and append to dataset""" - if self.model_name in ["Qwen3VL"]: + if self.model_name in ["Qwen3VL", "Qwen3VLMoE"]: inputs = self.processor.apply_chat_template( messages, tokenize=True, diff --git a/angelslim/models/vlm/__init__.py b/angelslim/models/vlm/__init__.py index 6eccd510..ce45e949 100644 --- a/angelslim/models/vlm/__init__.py +++ b/angelslim/models/vlm/__init__.py @@ -14,4 +14,5 @@ from .hunyuan_vl import HunyuanVL # noqa: F401 from .qwen3_vl import Qwen3VL # noqa: F401 +from .qwen3_vl_moe import Qwen3VLMoE # noqa: F401 from .qwen_vl import QwenVL # noqa: F401 diff --git a/angelslim/models/vlm/qwen3_vl_moe.py b/angelslim/models/vlm/qwen3_vl_moe.py new file mode 100644 index 00000000..5c28f532 --- /dev/null +++ b/angelslim/models/vlm/qwen3_vl_moe.py @@ -0,0 +1,387 @@ +# Copyright 2025 Tencent Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc + +import torch +import torch.nn as nn +import torch.nn.functional as F +from tqdm import tqdm +from transformers import ( + AutoProcessor, + AutoTokenizer, + Qwen3VLMoeForConditionalGeneration, +) +from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts + +from angelslim.compressor.quant.observers.base_observer import BaseObserver + +from ...compressor.quant.core import LossFilter, PTQVLMSaveVllmHF +from ...utils import find_layers, print_info +from ..base_model import BaseLLMModel +from ..model_factory import SlimModelFactory + + +def observer_forward( + self, + hidden_states: torch.Tensor, + routing_weights: torch.Tensor, + router_indices: torch.Tensor, +) -> torch.Tensor: + """ + When training it is more efficient to just loop over the experts and + compute the output for each expert + as otherwise the memory would explode. + + For inference we can sacrifice some memory and compute the output for + all experts at once. By repeating the inputs. + + Args: + hidden_states (torch.Tensor): (batch_size * token_num, hidden_size) + routing_weights (torch.Tensor): (batch_size * token_num, num_experts) + router_indices (torch.Tensor): (batch_size * token_num, top_k) + Returns: + torch.Tensor + """ + # replace Qwen3VLMoeTextExperts forward function by observer_forward" + batch_size = hidden_states.shape[0] + hidden_states = hidden_states.reshape( + -1, self.hidden_size + ) # (num_tokens, hidden_size) + if self.training: + next_states = torch.zeros_like( + hidden_states, dtype=hidden_states.dtype, device=hidden_states.device + ) + with torch.no_grad(): + expert_mask = torch.nn.functional.one_hot( + router_indices, num_classes=self.num_experts + ) + expert_mask = expert_mask.permute(2, 1, 0) + # we sum on the top_k and on the sequence length to get which experts + # are hit this time around + expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero() + for expert_idx in expert_hit[:]: + with torch.no_grad(): + _, token_idx = torch.where(expert_mask[expert_idx[0]]) + current_state = hidden_states[token_idx] + gate_up = current_state @ self.gate_up_proj[expert_idx] + gate, up = gate_up.chunk(2, dim=-1) + gated_output = up * self.act_fn(gate) + out = gated_output @ self.down_proj[expert_idx] + weighted_output = out[0] * routing_weights[token_idx, expert_idx, None] + next_states.index_add_( + 0, token_idx, weighted_output.to(hidden_states.dtype) + ) + next_states = next_states.view(batch_size, -1, self.hidden_size) + else: + hidden_states = hidden_states.repeat(self.num_experts, 1) + hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size) + if self.gateupobservers: + self.gateupobservers(hidden_states) + gate_up = torch.bmm(hidden_states, self.gate_up_proj) + gate, up = gate_up.chunk(2, dim=-1) # not supported for DTensors + if self.downobservers: + down_input = up * self.act_fn(gate) + self.downobservers(down_input) + next_states = torch.bmm((up * self.act_fn(gate)), self.down_proj) + next_states = next_states.reshape( + self.num_experts, batch_size, -1, self.hidden_size + ) + next_states = ( + next_states + * routing_weights.transpose(0, 1).view(self.num_experts, batch_size, -1)[ + ..., None + ] + ) + next_states = next_states.sum(dim=0) + return next_states + + +Qwen3VLMoeTextExperts.forward = observer_forward + + +@SlimModelFactory.register +class Qwen3VLMoE(BaseLLMModel): + def __init__( + self, + model=None, + deploy_backend="vllm", + ): + super().__init__( + model=model, + deploy_backend=deploy_backend, + ) + self.modal_type = "VLM" + self.block_name = "model.language_model.layers" + self.vit_block_name = "model.visual.blocks" + self.pre_transformer_module_names = [ + "visual", + "language_model.embed_tokens", + "language_model.norm", + "language_model.rotary_emb", + ] + self.observer_layer_classes = [nn.Linear, Qwen3VLMoeTextExperts] + + def from_pretrained( + self, + model_path, + torch_dtype="auto", + device_map="auto", + trust_remote_code=True, + low_cpu_mem_usage=True, + use_cache=False, + using_multi_nodes=False, + compress_config=None, + ): + self.model = Qwen3VLMoeForConditionalGeneration.from_pretrained( + model_path, + torch_dtype=torch_dtype, + device_map=device_map, + trust_remote_code=trust_remote_code, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + + # Load tokenizer + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) + + # Load processor + self.processor = AutoProcessor.from_pretrained( + model_path, trust_remote_code=trust_remote_code + ) + + def get_observer_layers(self): + names = [ + "self_attn.k_proj", + "self_attn.v_proj", + "self_attn.q_proj", + "self_attn.o_proj", + ] + + if hasattr(self.quant_config, "quant_vit") and self.quant_config.quant_vit: + vit_names = ["attn.qkv", "attn.proj", "mlp.linear_fc1", "mlp.linear_fc2"] + names.extend(vit_names) + + observer_layers_dict = {} + layers_dict = find_layers(self.model, layers=self.observer_layer_classes) + + ignore_layers = self.skip_layer_names() + for name, module in layers_dict.items(): + block_condition = name.startswith(self.block_name) or ( + hasattr(self.quant_config, "quant_vit") + and self.quant_config.quant_vit + and name.startswith(self.vit_block_name) + ) + parts = name.split(".") + result = ".".join(parts[-2:]) + if result == "mlp.experts": + if not hasattr(module, "gateupobservers"): + layername = name + ".gate_up" + module.gateupobservers = MyAbsmaxPertensorObserver( + layer_name=layername + ) + if not hasattr(module, "downobservers"): + layername = name + ".down" + module.downobservers = MyAbsmaxPertensorObserver( + layer_name=layername + ) + else: + if block_condition and result in names: + observer_layers_dict[name] = module + else: + ignore_layers.append(name) + self.quant_config.quant_algo_info["ignore_layers"] = ignore_layers + if self.quant_config.custom_observe_layers_names != "default": + for custom_observe_name in self.quant_config.custom_observe_layers_names: + for default_name in observer_layers_dict.keys(): + if custom_observe_name not in default_name: + observer_layers_dict.pop(default_name) + return observer_layers_dict + + def model_forward(self, dataloader, **kwargs): + self.model.use_cache = False + + calibrated_cnt = 0 + if ( + "gptq" in self.quant_config.quant_algo + or "awq" in self.quant_config.quant_algo + or "gptaq" in self.quant_config.quant_algo + ): + device = "cuda:0" + else: + device = self.model.device + print_info(f"device is {device}") + if dataloader is not None: + loss_filter = LossFilter(processor=self.processor) + with torch.no_grad(): + for batch in tqdm( + dataloader, desc="calibrating...", total=len(dataloader) + ): + if "pixel_values" in batch: + inputs = { + "input_ids": batch["input_ids"].to(device), + "attention_mask": batch["attention_mask"].to(device), + "pixel_values": batch["pixel_values"].to(device), + "image_grid_thw": batch["image_grid_thw"].to(device), + } + else: + inputs = { + "input_ids": batch["input_ids"].to(device), + "attention_mask": batch["attention_mask"].to(device), + } + + inputs = {k: v.to(device) for k, v in inputs.items()} + inputs["use_cache"] = False + labels = batch["labels"].to(device) + attention_mask = batch["attention_mask"].to(device) + try: + outputs = self.model(**inputs) + logits = outputs.logits.float() + + loss = F.cross_entropy( + logits.view(-1, logits.size(-1)), + labels.view(-1), + reduction="none", + ) + + attention_mask = ( + attention_mask.view(-1).to(logits.device).float() + ) + loss = loss * attention_mask + loss = loss_filter.filter_loss( + loss=loss, labels=labels, model_type="Qwen3VL" + ) + avg_loss = loss.mean() + ppl = torch.exp(avg_loss) + + print_info(f"ppl is : {ppl:.4f}") + + calibrated_cnt += 1 + except ValueError: + calibrated_cnt += 1 + pass + inputs = { + k: v.to("cpu") if isinstance(v, torch.Tensor) else v + for k, v in inputs.items() + } + attention_mask = attention_mask.to("cpu") + labels = labels.to("cpu") + del outputs, inputs + torch.cuda.synchronize() + torch.cuda.empty_cache() + gc.collect() + + def get_quant_module(self): + """ + Returns the module that will be quantized. + This is typically the main transformer module of the model. + """ + return self.model.model.language_model.layers + + def get_save_func(self): + if self.deploy_backend in ["vllm", "huggingface"]: + return PTQVLMSaveVllmHF + else: + raise NotImplementedError( + f"deploy_backend {self.deploy_backend} is not supported for saving." + ) + + +class MyAbsmaxPertensorObserver(BaseObserver): + def __init__(self, layer_name=None, quant_bits=8, **kwargs): + super(MyAbsmaxPertensorObserver, self).__init__(quant_bits=quant_bits) + self.layer_name = layer_name + self._scale = None + self._zero_point = None + self._min = None + self._max = torch.tensor(1e-7, dtype=torch.float32) + self.step = 0 + self.dtype = None + self.parent_observer = ( + kwargs["parent_observer"] + if kwargs and "parent_observer" in kwargs + else None + ) + + def forward(self, inputs): + """Calculate forward pass.""" + self.step += 1 + if not self.dtype: + self.dtype = inputs.dtype + if inputs.numel() > 0: + self._min, self._max = self._cal_min_max(inputs) + if self.parent_observer is not None: + self.parent_observer.update(self._min, self._max, self.step) + else: + assert self.parent_observer is not None + self._update_min_max(self.parent_observer.min, self.parent_observer.max) + return inputs + + def _cal_min_max(self, inputs): + if inputs.dim() >= 2: + abs_inputs = torch.abs(inputs) + batch_size = abs_inputs.shape[0] + abs_inputs_flat = abs_inputs.view( + batch_size, -1 + ) # [batch_size, seq_len * hidden_dim] + abs_max_val, _ = torch.max( + abs_inputs_flat, dim=1, keepdim=True + ) # [batch_size, 1] + min_threshold = self._max.to(abs_max_val.device).expand_as(abs_max_val) + abs_max_val = torch.maximum(abs_max_val, min_threshold) + else: + abs_max_val = torch.max(torch.abs(inputs)) + if abs_max_val.data < self._max.data: + abs_max_val = self._max + abs_max_val = abs_max_val.unsqueeze(0).unsqueeze(0) # [1, 1] + return 0, abs_max_val.to(inputs.device) + + def _update_min_max(self, min, max): + if min is not None and max is not None: + if self._min is None or min < self._min: + self._min = min + if self._max is None or max > self._max: + self._max = max + + def cal_thresholds(self): + """Compute thresholds for MAX function.""" + if self._scale is None: + self._scale = self._max + self._zero_point = torch.zeros_like(self._scale) + + def quant_axis(self): + """Return quantization axis.""" + return -1 + + def scales(self): + """Return output scales.""" + if self.step == 0 and self.parent_observer is not None: + self._update_min_max(self.parent_observer.min, self.parent_observer.max) + self.step = self.parent_observer.step + if self.step == 0: + raise ValueError( + "AbsmaxPertensorObserver scales must calibrate data first!" + ) + if self._scale is None: + self.cal_thresholds() + if self.dtype: + self._scale = self._scale.type(self.dtype) + return self._scale + + def zero_points(self): + """Return output zero points.""" + if self._zero_point is None: + self.cal_thresholds() + return self._zero_point diff --git a/angelslim/utils/config_parser.py b/angelslim/utils/config_parser.py index 5c048f73..8acb50cf 100644 --- a/angelslim/utils/config_parser.py +++ b/angelslim/utils/config_parser.py @@ -92,7 +92,7 @@ def get_max_seq_length(self) -> int: def set_model_hidden_size(self, model_path) -> int: json_data = get_hf_config(model_path) try: - if json_data["model_type"] in ["qwen3_vl"]: + if json_data["model_type"] in ["qwen3_vl", "qwen3_vl_moe"]: self.hidden_size = json_data["text_config"]["hidden_size"] elif ( json_data["architectures"][0] diff --git a/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml b/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml new file mode 100644 index 00000000..f1251767 --- /dev/null +++ b/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml @@ -0,0 +1,36 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: Qwen3VLMoE + model_path: Qwen/Qwen3-VL-235B-A22B-Instruct + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_static + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "model.visual.patch_embed.proj" + - "model.lm_head" + - "model.language_model.embed_tokens" + quant_vit: false + +# Dataset for calibration +dataset: + name: MultiModalDataset + data_path: dataset/multimodal_fake_data/fake_data_openai_formate.json + max_seq_length: 4096 + num_samples: 1024 + batch_size: 1 \ No newline at end of file diff --git a/configs/qwen3_vl/fp8_static/qwen3_vl-30b_a3b_fp8_static.yaml b/configs/qwen3_vl/fp8_static/qwen3_vl-30b_a3b_fp8_static.yaml new file mode 100644 index 00000000..12b235fa --- /dev/null +++ b/configs/qwen3_vl/fp8_static/qwen3_vl-30b_a3b_fp8_static.yaml @@ -0,0 +1,36 @@ +# Global configuration of pipeline +global: + save_path: ./output + +# Simplified Configuration for LLM compression +model: + name: Qwen3VLMoE + model_path: Qwen/Qwen3-VL-30B-A3B-Instruct + trust_remote_code: true + low_cpu_mem_usage: true + use_cache: false + torch_dtype: auto + device_map: auto + +# Compression configuration +compression: + name: PTQ + quantization: + name: fp8_static + bits: 8 + quant_method: + weight: "per-tensor" + activation: "per-tensor" + ignore_layers: # Skip quantization for these layers + - "model.visual.patch_embed.proj" + - "model.lm_head" + - "model.language_model.embed_tokens" + quant_vit: false + +# Dataset for calibration +dataset: + name: MultiModalDataset + data_path: dataset/multimodal_fake_data/fake_data_openai_formate.json + max_seq_length: 4096 + num_samples: 1024 + batch_size: 1 \ No newline at end of file diff --git a/dataset/multimodal_fake_data/fake_data_openai_formate.json b/dataset/multimodal_fake_data/fake_data_openai_formate.json new file mode 100644 index 00000000..05d7ffc9 --- /dev/null +++ b/dataset/multimodal_fake_data/fake_data_openai_formate.json @@ -0,0 +1,2 @@ +{"id": "0", "messages": [{"role": "user", "content": [{"type": "image", "image": "0.png"}, {"type": "text", "text": "How many food item is shown in the bar graph?"}]}, {"role": "assistant", "content": [{"type": "text", "text": "14"}]}]} +{"id": "1", "messages": [{"role": "user", "content": [{"type": "image", "image": "1.png"}, {"type": "text", "text": "What is the difference in value between Lamb and Corn?"}]}, {"role": "assistant", "content": [{"type": "text", "text": "0.57"}]}]} \ No newline at end of file From 46837de68e8706b0d6a48386f3d4147df2d4178b Mon Sep 17 00:00:00 2001 From: root Date: Tue, 13 Jan 2026 21:03:25 +0800 Subject: [PATCH 2/4] code refactoring --- .../compressor/quant/modules/__init__.py | 1 + .../compressor/quant/modules/helper_layer.py | 77 +++++++++ .../compressor/quant/observers/__init__.py | 1 + .../quant/observers/abs_max_activation.py | 89 ++++++++++ angelslim/compressor/quant/ptq.py | 135 +-------------- angelslim/models/base_model.py | 3 + angelslim/models/vlm/qwen3_vl_moe.py | 156 +++++++----------- 7 files changed, 238 insertions(+), 224 deletions(-) diff --git a/angelslim/compressor/quant/modules/__init__.py b/angelslim/compressor/quant/modules/__init__.py index 49b41bb2..ac23d34a 100644 --- a/angelslim/compressor/quant/modules/__init__.py +++ b/angelslim/compressor/quant/modules/__init__.py @@ -19,6 +19,7 @@ from .gptq.gptq import GPTQ # noqa: F401 from .gptq.gptq_module import GPTQModule # noqa: F401 from .helper_layer import GPTQQuantLinear # noqa: F401 +from .helper_layer import MoEQDQModule # noqa: F401 from .helper_layer import NVFP4QDQModule # noqa: F401 from .helper_layer import QDQModule # noqa: F401 from .helper_layer import QDQSingleModule # noqa: F401 diff --git a/angelslim/compressor/quant/modules/helper_layer.py b/angelslim/compressor/quant/modules/helper_layer.py index bb8284d0..c9d6bb4f 100644 --- a/angelslim/compressor/quant/modules/helper_layer.py +++ b/angelslim/compressor/quant/modules/helper_layer.py @@ -1023,3 +1023,80 @@ def _unpack_tensor(input: torch.Tensor): deq_data.shape[0], deq_data.shape[1] // block_size, -1 ) * per_block_scale.unsqueeze(-1) return deq_data.view(-1)[: np.prod(self.shape)].reshape(self.shape).to(dtype) + + +class MoEQDQModule(torch.nn.Module): + def __init__( + self, + gate_proj: torch.nn.Parameter, + up_proj: torch.nn.Parameter, + down_proj: torch.nn.Parameter, + gate_proj_weight_scale: torch.nn.Parameter, + up_proj_weight_scale: torch.nn.Parameter, + down_proj_weight_scale: torch.nn.Parameter, + gate_up_proj_input_scale: torch.nn.Parameter, + down_proj_input_scale: torch.nn.Parameter, + ): + super().__init__() + quant_gate_weight, _ = quantize_weight_per_tensor_fp8( + gate_proj, gate_proj_weight_scale + ) + quant_up_weight, _ = quantize_weight_per_tensor_fp8( + up_proj, up_proj_weight_scale + ) + quant_down_weight, _ = quantize_weight_per_tensor_fp8( + down_proj, down_proj_weight_scale + ) + quant_gate_up_weight = torch.cat([quant_gate_weight, quant_up_weight], dim=-1) + + self.gate_up_proj = torch.nn.Parameter( + quant_gate_up_weight, requires_grad=False + ) + self.down_proj = torch.nn.Parameter(quant_down_weight, requires_grad=False) + + gate_proj_weight_scale = ( + gate_proj_weight_scale.view(-1) + if gate_proj_weight_scale.ndim == 0 + else gate_proj_weight_scale + ) + up_proj_weight_scale = ( + up_proj_weight_scale.view(-1) + if up_proj_weight_scale.ndim == 0 + else up_proj_weight_scale + ) + down_proj_weight_scale = ( + down_proj_weight_scale.view(-1) + if down_proj_weight_scale.ndim == 0 + else down_proj_weight_scale + ) + gate_up_proj_weight_scale = torch.cat( + [gate_proj_weight_scale, up_proj_weight_scale], dim=-1 + ) + + self.gate_up_proj_weight_scale = torch.nn.Parameter( + gate_up_proj_weight_scale, requires_grad=False + ) + self.down_proj_weight_scale = torch.nn.Parameter( + down_proj_weight_scale, requires_grad=False + ) + + down_proj_input_scale = ( + down_proj_input_scale.view(-1) + if down_proj_input_scale.ndim == 0 + else down_proj_input_scale.squeeze() + ) + gate_up_proj_input_scale = ( + gate_up_proj_input_scale.view(-1) + if gate_up_proj_input_scale.ndim == 0 + else gate_up_proj_input_scale.squeeze() + ) + + self.gate_up_proj_input_scale = torch.nn.Parameter( + gate_up_proj_input_scale, requires_grad=False + ) + self.down_proj_input_scale = torch.nn.Parameter( + down_proj_input_scale, requires_grad=False + ) + + def forward(self, x): + pass diff --git a/angelslim/compressor/quant/observers/__init__.py b/angelslim/compressor/quant/observers/__init__.py index 6287a896..bb021db2 100644 --- a/angelslim/compressor/quant/observers/__init__.py +++ b/angelslim/compressor/quant/observers/__init__.py @@ -15,6 +15,7 @@ from .abs_max_activation import AbsmaxPerchannelObserver # noqa: F401 from .abs_max_activation import AbsmaxPertensorObserver # noqa: F401 from .abs_max_activation import AbsMaxTokenWiseActObserver # noqa: F401; noqa: F401 +from .abs_max_activation import MoEAbsmaxPertensorObserver # noqa: F401 from .abs_max_weight import AbsMaxChannelWiseWeightObserver # noqa: F401 from .base_observer import BaseObserver, ParentObserver # noqa: F401 from .ema_activation import EMAObserver # noqa: F401 diff --git a/angelslim/compressor/quant/observers/abs_max_activation.py b/angelslim/compressor/quant/observers/abs_max_activation.py index 7d390540..51afd2f5 100644 --- a/angelslim/compressor/quant/observers/abs_max_activation.py +++ b/angelslim/compressor/quant/observers/abs_max_activation.py @@ -20,6 +20,7 @@ "AbsmaxPertensorObserver", "AbsMaxTokenWiseActObserver", "AbsmaxPerchannelObserver", + "MoEAbsmaxPertensorObserver", ] @@ -217,3 +218,91 @@ def zero_points(self): if self._zero_point is None: self.cal_thresholds() return self._zero_point + + +class MoEAbsmaxPertensorObserver(BaseObserver): + def __init__(self, layer_name=None, quant_bits=8, **kwargs): + super(MoEAbsmaxPertensorObserver, self).__init__(quant_bits=quant_bits) + self.layer_name = layer_name + self._scale = None + self._zero_point = None + self._min = None + self._max = torch.tensor(1e-7, dtype=torch.float32) + self.step = 0 + self.dtype = None + self.parent_observer = ( + kwargs["parent_observer"] + if kwargs and "parent_observer" in kwargs + else None + ) + + def forward(self, inputs): + """Calculate forward pass.""" + self.step += 1 + if not self.dtype: + self.dtype = inputs.dtype + if inputs.numel() > 0: + self._min, self._max = self._cal_min_max(inputs) + if self.parent_observer is not None: + self.parent_observer.update(self._min, self._max, self.step) + else: + assert self.parent_observer is not None + self._update_min_max(self.parent_observer.min, self.parent_observer.max) + return inputs + + def _cal_min_max(self, inputs): + if inputs.dim() >= 2: + abs_inputs = torch.abs(inputs) + batch_size = abs_inputs.shape[0] + abs_inputs_flat = abs_inputs.view( + batch_size, -1 + ) # [batch_size, seq_len * hidden_dim] + abs_max_val, _ = torch.max( + abs_inputs_flat, dim=1, keepdim=True + ) # [batch_size, 1] + min_threshold = self._max.to(abs_max_val.device).expand_as(abs_max_val) + abs_max_val = torch.maximum(abs_max_val, min_threshold) + else: + abs_max_val = torch.max(torch.abs(inputs)) + if abs_max_val.data < self._max.data: + abs_max_val = self._max + abs_max_val = abs_max_val.unsqueeze(0).unsqueeze(0) # [1, 1] + return 0, abs_max_val.to(inputs.device) + + def _update_min_max(self, min, max): + if min is not None and max is not None: + if self._min is None or min < self._min: + self._min = min + if self._max is None or max > self._max: + self._max = max + + def cal_thresholds(self): + """Compute thresholds for MAX function.""" + if self._scale is None: + self._scale = self._max + self._zero_point = torch.zeros_like(self._scale) + + def quant_axis(self): + """Return quantization axis.""" + return -1 + + def scales(self): + """Return output scales.""" + if self.step == 0 and self.parent_observer is not None: + self._update_min_max(self.parent_observer.min, self.parent_observer.max) + self.step = self.parent_observer.step + if self.step == 0: + raise ValueError( + "AbsmaxPertensorObserver scales must calibrate data first!" + ) + if self._scale is None: + self.cal_thresholds() + if self.dtype: + self._scale = self._scale.type(self.dtype) + return self._scale + + def zero_points(self): + """Return output zero points.""" + if self._zero_point is None: + self.cal_thresholds() + return self._zero_point diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index 39252b7f..244e98f3 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -18,12 +18,6 @@ import torch from safetensors.torch import load_file -from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts - -from angelslim.compressor.quant.core.quant_func import ( - get_fp_maxval, - quantize_weight_per_tensor_fp8, -) from ...utils import find_parent_layer_and_sub_name, print_info from ..compressor_factory import CompressorFactory @@ -291,131 +285,16 @@ def _convert(self): if qdq_module is not sub_layer: setattr(parent_layer, sub_name, qdq_module) - maxval = get_fp_maxval(bits=8) + # 3. insert moe qdq module for name, sub_layer in self.quant_model.model.named_modules(): - if isinstance(sub_layer, Qwen3VLMoeTextExperts): - parent_layer, sub_name = find_parent_layer_and_sub_name( - quant_convert_module, name - ) - gate_up_act_max = sub_layer.gateupobservers.scales() - down_act_max = sub_layer.downobservers.scales() - gate_up_act_dtype = gate_up_act_max.dtype - down_act_dtype = down_act_max.dtype - gate_up_act_scale = gate_up_act_max / maxval.type(gate_up_act_dtype) - down_act_scale = down_act_max / maxval.type(down_act_dtype) - - gate_proj, up_proj = sub_layer.gate_up_proj.chunk(2, dim=-1) - abs_inputs = torch.abs(gate_proj) - batch_size = abs_inputs.shape[0] - abs_inputs_flat = abs_inputs.view(batch_size, -1) - gate_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) - - abs_inputs = torch.abs(up_proj) - batch_size = abs_inputs.shape[0] - abs_inputs_flat = abs_inputs.view(batch_size, -1) - up_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) - - abs_inputs = torch.abs(sub_layer.down_proj) - batch_size = abs_inputs.shape[0] - abs_inputs_flat = abs_inputs.view(batch_size, -1) - down_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) - - gate_weight_dtype = gate_proj.dtype - up_weight_dtype = up_proj.dtype - down_weight_dtype = sub_layer.down_proj.dtype - gate_weight_scale = gate_weight_max / maxval.type(gate_weight_dtype) - up_weight_scale = up_weight_max / maxval.type(up_weight_dtype) - down_weight_scale = down_weight_max / maxval.type(down_weight_dtype) + parent_layer, sub_name = find_parent_layer_and_sub_name( + quant_convert_module, name + ) + moe_qdq_module = self.quant_model.get_moe_qdq_module(sub_layer, name) + if moe_qdq_module is not sub_layer: + setattr(parent_layer, sub_name, moe_qdq_module) - q_linear = MyQDQModule( - gate_proj=gate_proj.cpu(), - up_proj=up_proj.cpu(), - down_proj=sub_layer.down_proj.cpu(), - gate_proj_weight_scale=gate_weight_scale.cpu(), - up_proj_weight_scale=up_weight_scale.cpu(), - down_proj_weight_scale=down_weight_scale.cpu(), - gate_up_proj_input_scale=gate_up_act_scale.cpu(), - down_proj_input_scale=down_act_scale.cpu(), - ) - setattr(parent_layer, sub_name, q_linear) self.quant_model.quantized = True def __getattr__(self, item): return super().__getattr__(item) - - -class MyQDQModule(torch.nn.Module): - def __init__( - self, - gate_proj: torch.nn.Parameter, - up_proj: torch.nn.Parameter, - down_proj: torch.nn.Parameter, - gate_proj_weight_scale: torch.nn.Parameter, - up_proj_weight_scale: torch.nn.Parameter, - down_proj_weight_scale: torch.nn.Parameter, - gate_up_proj_input_scale: torch.nn.Parameter, - down_proj_input_scale: torch.nn.Parameter, - ): - super().__init__() - quant_gate_weight, _ = quantize_weight_per_tensor_fp8( - gate_proj, gate_proj_weight_scale - ) - quant_up_weight, _ = quantize_weight_per_tensor_fp8( - up_proj, up_proj_weight_scale - ) - quant_down_weight, _ = quantize_weight_per_tensor_fp8( - down_proj, down_proj_weight_scale - ) - quant_gate_up_weight = torch.cat([quant_gate_weight, quant_up_weight], dim=-1) - - self.gate_up_proj = torch.nn.Parameter( - quant_gate_up_weight, requires_grad=False - ) - self.down_proj = torch.nn.Parameter(quant_down_weight, requires_grad=False) - - gate_proj_weight_scale = ( - gate_proj_weight_scale.view(-1) - if gate_proj_weight_scale.ndim == 0 - else gate_proj_weight_scale - ) - up_proj_weight_scale = ( - up_proj_weight_scale.view(-1) - if up_proj_weight_scale.ndim == 0 - else up_proj_weight_scale - ) - down_proj_weight_scale = ( - down_proj_weight_scale.view(-1) - if down_proj_weight_scale.ndim == 0 - else down_proj_weight_scale - ) - gate_up_proj_weight_scale = torch.cat( - [gate_proj_weight_scale, up_proj_weight_scale], dim=-1 - ) - - self.gate_up_proj_weight_scale = torch.nn.Parameter( - gate_up_proj_weight_scale, requires_grad=False - ) - self.down_proj_weight_scale = torch.nn.Parameter( - down_proj_weight_scale, requires_grad=False - ) - - down_proj_input_scale = ( - down_proj_input_scale.view(-1) - if down_proj_input_scale.ndim == 0 - else down_proj_input_scale.squeeze() - ) - gate_up_proj_input_scale = ( - gate_up_proj_input_scale.view(-1) - if gate_up_proj_input_scale.ndim == 0 - else gate_up_proj_input_scale.squeeze() - ) - - self.gate_up_proj_input_scale = torch.nn.Parameter( - gate_up_proj_input_scale, requires_grad=False - ) - self.down_proj_input_scale = torch.nn.Parameter( - down_proj_input_scale, requires_grad=False - ) - - def forward(self, x): - pass diff --git a/angelslim/models/base_model.py b/angelslim/models/base_model.py index 17e9316c..99e8c398 100644 --- a/angelslim/models/base_model.py +++ b/angelslim/models/base_model.py @@ -147,6 +147,9 @@ def get_qdq_module(self, sub_layer, name): raise NotImplementedError return q_linear + def get_moe_qdq_module(self, sub_layer, name): + return sub_layer + def get_nvfp4_qdq_module(self, sub_layer, name): act_scale, weight_scale, weight_scale_2 = None, None, None block_size = self.quant_config.quant_algo_info["block_size"] diff --git a/angelslim/models/vlm/qwen3_vl_moe.py b/angelslim/models/vlm/qwen3_vl_moe.py index 5c28f532..c4302e45 100644 --- a/angelslim/models/vlm/qwen3_vl_moe.py +++ b/angelslim/models/vlm/qwen3_vl_moe.py @@ -25,15 +25,17 @@ ) from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts -from angelslim.compressor.quant.observers.base_observer import BaseObserver +from angelslim.compressor.quant.core.quant_func import get_fp_maxval +from angelslim.compressor.quant.observers import MoEAbsmaxPertensorObserver from ...compressor.quant.core import LossFilter, PTQVLMSaveVllmHF +from ...compressor.quant.modules import MoEQDQModule from ...utils import find_layers, print_info from ..base_model import BaseLLMModel from ..model_factory import SlimModelFactory -def observer_forward( +def moe_observer_forward( self, hidden_states: torch.Tensor, routing_weights: torch.Tensor, @@ -108,9 +110,6 @@ def observer_forward( return next_states -Qwen3VLMoeTextExperts.forward = observer_forward - - @SlimModelFactory.register class Qwen3VLMoE(BaseLLMModel): def __init__( @@ -142,7 +141,6 @@ def from_pretrained( low_cpu_mem_usage=True, use_cache=False, using_multi_nodes=False, - compress_config=None, ): self.model = Qwen3VLMoeForConditionalGeneration.from_pretrained( model_path, @@ -162,6 +160,14 @@ def from_pretrained( model_path, trust_remote_code=trust_remote_code ) + def init_ptq(self, slim_config): + for _, module in self.model.named_modules(): + if isinstance(module, Qwen3VLMoeTextExperts): + module.forward = moe_observer_forward.__get__( + module, Qwen3VLMoeTextExperts + ) + super().init_ptq(slim_config) + def get_observer_layers(self): names = [ "self_attn.k_proj", @@ -189,12 +195,12 @@ def get_observer_layers(self): if result == "mlp.experts": if not hasattr(module, "gateupobservers"): layername = name + ".gate_up" - module.gateupobservers = MyAbsmaxPertensorObserver( + module.gateupobservers = MoEAbsmaxPertensorObserver( layer_name=layername ) if not hasattr(module, "downobservers"): layername = name + ".down" - module.downobservers = MyAbsmaxPertensorObserver( + module.downobservers = MoEAbsmaxPertensorObserver( layer_name=layername ) else: @@ -210,6 +216,52 @@ def get_observer_layers(self): observer_layers_dict.pop(default_name) return observer_layers_dict + def get_moe_qdq_module(self, sub_layer, name): + if not isinstance(sub_layer, Qwen3VLMoeTextExperts): + return sub_layer + maxval = get_fp_maxval(bits=8) + gate_up_act_max = sub_layer.gateupobservers.scales() + down_act_max = sub_layer.downobservers.scales() + gate_up_act_dtype = gate_up_act_max.dtype + down_act_dtype = down_act_max.dtype + gate_up_act_scale = gate_up_act_max / maxval.type(gate_up_act_dtype) + down_act_scale = down_act_max / maxval.type(down_act_dtype) + + gate_proj, up_proj = sub_layer.gate_up_proj.chunk(2, dim=-1) + abs_inputs = torch.abs(gate_proj) + batch_size = abs_inputs.shape[0] + abs_inputs_flat = abs_inputs.view(batch_size, -1) + gate_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) + + abs_inputs = torch.abs(up_proj) + batch_size = abs_inputs.shape[0] + abs_inputs_flat = abs_inputs.view(batch_size, -1) + up_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) + + abs_inputs = torch.abs(sub_layer.down_proj) + batch_size = abs_inputs.shape[0] + abs_inputs_flat = abs_inputs.view(batch_size, -1) + down_weight_max, _ = torch.max(abs_inputs_flat, dim=1, keepdim=True) + + gate_weight_dtype = gate_proj.dtype + up_weight_dtype = up_proj.dtype + down_weight_dtype = sub_layer.down_proj.dtype + gate_weight_scale = gate_weight_max / maxval.type(gate_weight_dtype) + up_weight_scale = up_weight_max / maxval.type(up_weight_dtype) + down_weight_scale = down_weight_max / maxval.type(down_weight_dtype) + + q_linear = MoEQDQModule( + gate_proj=gate_proj.cpu(), + up_proj=up_proj.cpu(), + down_proj=sub_layer.down_proj.cpu(), + gate_proj_weight_scale=gate_weight_scale.cpu(), + up_proj_weight_scale=up_weight_scale.cpu(), + down_proj_weight_scale=down_weight_scale.cpu(), + gate_up_proj_input_scale=gate_up_act_scale.cpu(), + down_proj_input_scale=down_act_scale.cpu(), + ) + return q_linear + def model_forward(self, dataloader, **kwargs): self.model.use_cache = False @@ -297,91 +349,3 @@ def get_save_func(self): raise NotImplementedError( f"deploy_backend {self.deploy_backend} is not supported for saving." ) - - -class MyAbsmaxPertensorObserver(BaseObserver): - def __init__(self, layer_name=None, quant_bits=8, **kwargs): - super(MyAbsmaxPertensorObserver, self).__init__(quant_bits=quant_bits) - self.layer_name = layer_name - self._scale = None - self._zero_point = None - self._min = None - self._max = torch.tensor(1e-7, dtype=torch.float32) - self.step = 0 - self.dtype = None - self.parent_observer = ( - kwargs["parent_observer"] - if kwargs and "parent_observer" in kwargs - else None - ) - - def forward(self, inputs): - """Calculate forward pass.""" - self.step += 1 - if not self.dtype: - self.dtype = inputs.dtype - if inputs.numel() > 0: - self._min, self._max = self._cal_min_max(inputs) - if self.parent_observer is not None: - self.parent_observer.update(self._min, self._max, self.step) - else: - assert self.parent_observer is not None - self._update_min_max(self.parent_observer.min, self.parent_observer.max) - return inputs - - def _cal_min_max(self, inputs): - if inputs.dim() >= 2: - abs_inputs = torch.abs(inputs) - batch_size = abs_inputs.shape[0] - abs_inputs_flat = abs_inputs.view( - batch_size, -1 - ) # [batch_size, seq_len * hidden_dim] - abs_max_val, _ = torch.max( - abs_inputs_flat, dim=1, keepdim=True - ) # [batch_size, 1] - min_threshold = self._max.to(abs_max_val.device).expand_as(abs_max_val) - abs_max_val = torch.maximum(abs_max_val, min_threshold) - else: - abs_max_val = torch.max(torch.abs(inputs)) - if abs_max_val.data < self._max.data: - abs_max_val = self._max - abs_max_val = abs_max_val.unsqueeze(0).unsqueeze(0) # [1, 1] - return 0, abs_max_val.to(inputs.device) - - def _update_min_max(self, min, max): - if min is not None and max is not None: - if self._min is None or min < self._min: - self._min = min - if self._max is None or max > self._max: - self._max = max - - def cal_thresholds(self): - """Compute thresholds for MAX function.""" - if self._scale is None: - self._scale = self._max - self._zero_point = torch.zeros_like(self._scale) - - def quant_axis(self): - """Return quantization axis.""" - return -1 - - def scales(self): - """Return output scales.""" - if self.step == 0 and self.parent_observer is not None: - self._update_min_max(self.parent_observer.min, self.parent_observer.max) - self.step = self.parent_observer.step - if self.step == 0: - raise ValueError( - "AbsmaxPertensorObserver scales must calibrate data first!" - ) - if self._scale is None: - self.cal_thresholds() - if self.dtype: - self._scale = self._scale.type(self.dtype) - return self._scale - - def zero_points(self): - """Return output zero points.""" - if self._zero_point is None: - self.cal_thresholds() - return self._zero_point From fd9430fac58ef76f8ce6f069a6881afe45a557db Mon Sep 17 00:00:00 2001 From: root Date: Tue, 13 Jan 2026 21:09:40 +0800 Subject: [PATCH 3/4] fix fp8 quant yaml name --- ...35b_a3b_fp8_static.yaml => qwen3_vl-235b_a22b_fp8_static.yaml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename configs/qwen3_vl/fp8_static/{qwen3_vl-235b_a3b_fp8_static.yaml => qwen3_vl-235b_a22b_fp8_static.yaml} (100%) diff --git a/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml b/configs/qwen3_vl/fp8_static/qwen3_vl-235b_a22b_fp8_static.yaml similarity index 100% rename from configs/qwen3_vl/fp8_static/qwen3_vl-235b_a3b_fp8_static.yaml rename to configs/qwen3_vl/fp8_static/qwen3_vl-235b_a22b_fp8_static.yaml From 8fa9b7b4345f6b74905da300e80a5a201ff522a8 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 13 Jan 2026 21:41:06 +0800 Subject: [PATCH 4/4] update insert moe qdq module --- angelslim/compressor/quant/ptq.py | 19 ++++++++++++------- angelslim/models/vlm/qwen3_vl_moe.py | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/angelslim/compressor/quant/ptq.py b/angelslim/compressor/quant/ptq.py index 244e98f3..281c4bfe 100644 --- a/angelslim/compressor/quant/ptq.py +++ b/angelslim/compressor/quant/ptq.py @@ -18,6 +18,7 @@ import torch from safetensors.torch import load_file +from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextExperts from ...utils import find_parent_layer_and_sub_name, print_info from ..compressor_factory import CompressorFactory @@ -286,13 +287,17 @@ def _convert(self): setattr(parent_layer, sub_name, qdq_module) # 3. insert moe qdq module - for name, sub_layer in self.quant_model.model.named_modules(): - parent_layer, sub_name = find_parent_layer_and_sub_name( - quant_convert_module, name - ) - moe_qdq_module = self.quant_model.get_moe_qdq_module(sub_layer, name) - if moe_qdq_module is not sub_layer: - setattr(parent_layer, sub_name, moe_qdq_module) + # For qwen3_vl_moe models, we need to insert MoEQDQModule for MOE experts, + # since these modules contain gate_up_proj and down_proj, which are defined as + # nn.Parameters, not nn.Linear. + if Qwen3VLMoeTextExperts in self.quant_model.observer_layer_classes: + for name, sub_layer in self.quant_model.model.named_modules(): + parent_layer, sub_name = find_parent_layer_and_sub_name( + quant_convert_module, name + ) + moe_qdq_module = self.quant_model.get_moe_qdq_module(sub_layer, name) + if moe_qdq_module is not sub_layer: + setattr(parent_layer, sub_name, moe_qdq_module) self.quant_model.quantized = True diff --git a/angelslim/models/vlm/qwen3_vl_moe.py b/angelslim/models/vlm/qwen3_vl_moe.py index c4302e45..470e3430 100644 --- a/angelslim/models/vlm/qwen3_vl_moe.py +++ b/angelslim/models/vlm/qwen3_vl_moe.py @@ -56,7 +56,7 @@ def moe_observer_forward( Returns: torch.Tensor """ - # replace Qwen3VLMoeTextExperts forward function by observer_forward" + # replace Qwen3VLMoeTextExperts forward function by moe_observer_forward" batch_size = hidden_states.shape[0] hidden_states = hidden_states.reshape( -1, self.hidden_size