|
| 1 | +# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai |
| 2 | +# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai |
| 3 | +# SPDX-License-Identifier: Apache-2.0 |
| 4 | +# Contact: qubitium@modelcloud.ai, x.com/qubitium |
| 5 | + |
| 6 | +import torch |
| 7 | +from torch import nn |
| 8 | + |
| 9 | +from ..moe_lifecycle import GateUpDownMoELifecycleHooks |
| 10 | +from .ovis2_5 import Ovis2_5QModel |
| 11 | + |
| 12 | + |
class Ovis2_6_MoeQModel(Ovis2_5QModel):
    """Quantization model definition for Ovis 2.6 MoE, built on ``Ovis2_5QModel``.

    The class attributes describe where the language-model pieces live under
    the ``llm`` prefix and which layer sub-modules appear in ``module_tree``.
    Before the parent's pre-quantize generate hook runs, the vision tower's
    ``post_layernorm`` is given default values if its parameters are still on
    the meta device.
    """

    # Name of the expert-count field (presumably looked up on the model
    # config; confirm against the base class).
    dynamic_expert_index = "num_experts"

    # Dotted module paths, all rooted under the ``llm`` sub-model.
    pre_lm_head_norm_module = "llm.model.norm"
    rotary_embedding = "llm.model.rotary_emb"

    awq_scale_optimize_shape_dependent_modules = ["self_attn.o_proj"]

    defuser_module_paths = ("llm",)

    moe_lifecycle_hooks = GateUpDownMoELifecycleHooks()

    # Path to the decoder layers followed by the per-layer module layout.
    # NOTE(review): the ":!", ":0", ":1" and ":moe:?" suffixes are interpreted
    # by the base-class machinery; their meaning is not visible in this file.
    module_tree = [
        "llm",
        "model",
        "layers",
        "#",
        {
            "input_layernorm": ("input_layernorm:!",),
            "self_attn": (
                "q_norm:!",
                "k_norm:!",
                "q_proj:0",
                "k_proj:0",
                "v_proj:0",
                "o_proj:1",
            ),
            "post_attention_layernorm": ("post_attention_layernorm:!",),
            "mlp:moe:?": {
                "gate": ("gate:!",),
                "experts": {
                    "#": ("gate_proj:0", "up_proj:0", "down_proj:1"),
                },
            },
        },
    ]

    @staticmethod
    def _materialize_layernorm_defaults(layernorm: nn.LayerNorm, device: torch.device) -> None:
        """Replace meta-device LayerNorm parameters with real default tensors.

        A ``weight`` still on the meta device becomes all ones and a ``bias``
        still on the meta device becomes all zeros, each allocated on
        ``device`` with the dtype and ``requires_grad`` flag of the parameter
        it replaces. Parameters that are ``None`` or already materialized are
        left untouched.

        Args:
            layernorm: The LayerNorm module to patch in place.
            device: Device on which the replacement tensors are allocated.
        """
        # weight is handled before bias, each with its own fill factory.
        for attr_name, make_default in (("weight", torch.ones), ("bias", torch.zeros)):
            current = getattr(layernorm, attr_name)
            if current is None:
                continue

            on_meta = getattr(current, "is_meta", False) or current.device.type == "meta"
            if not on_meta:
                continue

            replacement = nn.Parameter(
                make_default(layernorm.normalized_shape, device=device, dtype=current.dtype),
                requires_grad=current.requires_grad,
            )
            setattr(layernorm, attr_name, replacement)

    def _materialize_missing_vision_post_layernorm(self, device: torch.device) -> None:
        """Materialize ``visual_tokenizer.vit.vision_model.post_layernorm`` if present.

        Every hop below ``visual_tokenizer`` is optional: a missing attribute
        yields ``None`` and the method returns without doing anything. The
        same happens when the resolved object is not an ``nn.LayerNorm``.

        Args:
            device: Device on which default parameters are allocated.
        """
        vit = getattr(self.model.visual_tokenizer, "vit", None)
        vision_model = getattr(vit, "vision_model", None)
        candidate = getattr(vision_model, "post_layernorm", None)

        if not isinstance(candidate, nn.LayerNorm):
            return

        self._materialize_layernorm_defaults(candidate, device)

    def pre_quantize_generate_hook_start(self):
        """Patch the vision post-layernorm, then run the parent hook."""
        # Ovis 2.6 checkpoints omit SigLIP2 post_layernorm tensors even though
        # the remote code constructs the LayerNorm. Keep its default init instead of
        # resolving nonexistent checkpoint keys.
        target_device = torch.device(self.quantize_config.device)
        self._materialize_missing_vision_post_layernorm(target_device)
        super().pre_quantize_generate_hook_start()
0 commit comments