Skip to content

Commit 20ae545

Browse files
authored
fix models for transformers>=5 (#4381)
* fix models for transformers>=5
* remove qwen2_vl config
1 parent 456aca0 commit 20ae545

8 files changed

Lines changed: 56 additions & 17 deletions

File tree

lmdeploy/pytorch/configurations/chatglm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ def condition(cls, hf_config):
1515
def build(cls, hf_config, model_path: str = None, **kwargs):
1616
"""build."""
1717
head_dim = hf_config.hidden_size // hf_config.num_attention_heads
18-
bos_token_id = hf_config.bos_token_id
18+
bos_token_id = getattr(hf_config, 'bos_token_id', None)
1919
if bos_token_id is None:
2020
bos_token_id = hf_config.pad_token_id
2121

lmdeploy/pytorch/configurations/deepseek_v2.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,13 @@ def build(cls, hf_config, model_path: str = None, is_draft_model: bool = False,
4141
if is_draft_model or spec_method is not None:
4242
model_paradigm = 'ar_spec'
4343

44+
bos_token_id = getattr(hf_config, 'bos_token_id', None)
4445
config = ModelConfig(
4546
hidden_size=hf_config.hidden_size,
4647
num_layers=num_layers,
4748
num_attention_heads=num_attention_heads,
4849
num_key_value_heads=num_key_value_heads,
49-
bos_token_id=hf_config.bos_token_id,
50+
bos_token_id=bos_token_id,
5051
eos_token_id=hf_config.eos_token_id,
5152
head_dim=head_dim,
5253
k_head_dim=k_head_dim,

lmdeploy/pytorch/configurations/qwen3_vl.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,15 @@ class Qwen3VLModelConfigBuilder(AutoModelConfigBuilder):
88
@classmethod
99
def condition(cls, hf_config):
1010
"""config."""
11-
return hf_config.model_type in ['qwen3_vl', 'qwen3_vl_moe']
11+
return hf_config.model_type in ['qwen2_vl', 'qwen2_5_vl', 'qwen3_vl', 'qwen3_vl_moe']
1212

1313
@classmethod
1414
def build(cls, hf_config, model_path: str = None, **kwargs):
1515
"""build."""
16+
if not hasattr(hf_config, 'text_config'):
17+
# for transformers <= 5
18+
return DefaultModelConfigBuilder.build(hf_config, model_path, **kwargs)
19+
1620
if hasattr(hf_config, 'quantization_config') and not hasattr(hf_config.text_config, 'quantization_config'):
1721
setattr(hf_config.text_config, 'quantization_config', hf_config.quantization_config)
1822
cfg = DefaultModelConfigBuilder.build(hf_config.text_config, model_path, **kwargs)

lmdeploy/pytorch/models/gemma.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,7 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
219219
is_tp=True,
220220
)
221221

222-
hidden_activation = config.hidden_activation
222+
hidden_activation = getattr(config, 'hidden_activation', None)
223223
if hidden_activation is None:
224224
hidden_activation = 'gelu_pytorch_tanh'
225225
assert hidden_activation == 'gelu_pytorch_tanh'
@@ -381,16 +381,47 @@ def __init__(self, config: PretrainedConfig, dtype: torch.dtype = None, device:
381381
self.norm = RMSNorm(config.hidden_size, config.rms_norm_eps, dtype=dtype, device=device)
382382

383383
# build rotary embedding
384-
self.rotary_emb = build_rotary_embedding_from_config(config)
384+
self.build_rope_emb(config)
385385

386-
if self.model_type == 'gemma3_text':
387-
rope_dim = config.head_dim
388-
rope_max_pos_emb = config.max_position_embeddings
386+
def build_rope_emb(self, config: PretrainedConfig):
387+
rope_dim = config.head_dim
388+
rope_max_pos_emb = config.max_position_embeddings
389+
390+
if self.model_type != 'gemma3_text':
391+
self.rotary_emb = build_rotary_embedding_from_config(config)
392+
return
393+
394+
# for gemma3
395+
if hasattr(config, 'rope_local_base_freq'):
389396
rope_base = config.rope_local_base_freq
397+
self.rotary_emb = build_rotary_embedding_from_config(config)
398+
399+
if self.model_type == 'gemma3_text':
400+
self.rotary_emb_local = build_rotary_embedding(
401+
rope_dim,
402+
rope_max_pos_emb,
403+
rope_base,
404+
emb_type=RopeType.Default,
405+
)
406+
else:
407+
# for transformers>=5
408+
rope_dim = config.head_dim
409+
from lmdeploy.pytorch.nn.rotary_embedding import get_rope_parameters
410+
rope_parameters = get_rope_parameters(config)
411+
full_attention = rope_parameters['full_attention']
412+
sliding_attention = rope_parameters['sliding_attention']
413+
# note that emb type has been fixed.
414+
self.rotary_emb = build_rotary_embedding(
415+
rope_dim,
416+
rope_max_pos_emb,
417+
base=full_attention['rope_theta'],
418+
scaling_factor=full_attention['factor'],
419+
emb_type=RopeType.LinearScaling,
420+
)
390421
self.rotary_emb_local = build_rotary_embedding(
391422
rope_dim,
392423
rope_max_pos_emb,
393-
rope_base,
424+
base=sliding_attention['rope_theta'],
394425
emb_type=RopeType.Default,
395426
)
396427

lmdeploy/pytorch/models/llama4.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -811,7 +811,6 @@ def __init__(self,
811811
self._update_quant_config(config)
812812
self.language_model = Llama4ForCausalLM(config.text_config, ctx_mgr, dtype=dtype, device=device)
813813
self.vocab_size = config.text_config.vocab_size
814-
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
815814

816815
self.input_processor = Llama4InputProcessor(config, dtype)
817816

lmdeploy/pytorch/models/qwen2_5_vl.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -406,11 +406,13 @@ def __init__(self,
406406
dtype=dtype,
407407
device=device,
408408
)
409+
# get text_config
410+
text_config = getattr(config, 'text_config', config)
409411
# build model
410-
self.model = Qwen2Model(config, dtype=dtype, device=device)
412+
self.model = Qwen2Model(text_config, dtype=dtype, device=device)
411413
# build lm_head
412-
self.lm_head = build_rowwise_linear(config.hidden_size,
413-
config.vocab_size,
414+
self.lm_head = build_rowwise_linear(text_config.hidden_size,
415+
text_config.vocab_size,
414416
bias=False,
415417
dtype=dtype,
416418
device=device)

lmdeploy/pytorch/models/qwen2_vl.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -625,11 +625,13 @@ def __init__(self,
625625
dtype=dtype,
626626
device=device,
627627
)
628+
# get text_config
629+
text_config = getattr(config, 'text_config', config)
628630
# build model
629-
self.model = Qwen2Model(config, dtype=dtype, device=device)
631+
self.model = Qwen2Model(text_config, dtype=dtype, device=device)
630632
# build lm_head
631-
self.lm_head = build_rowwise_linear(config.hidden_size,
632-
config.vocab_size,
633+
self.lm_head = build_rowwise_linear(text_config.hidden_size,
634+
text_config.vocab_size,
633635
bias=False,
634636
dtype=dtype,
635637
device=device)

lmdeploy/vl/model/cogvlm.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ def proc_messages(messages, chat_template, sequence_start):
7070
prompt_messages.append(dict(role='user', content=content[0], num_images=n_images))
7171

7272
from lmdeploy.model import Vicuna
73-
llm_chat_template = Vicuna(eoa=chat_template.eoa, stop_words=chat_template.stop_words)
73+
llm_chat_template = Vicuna(eoa='</s>', stop_words=chat_template.stop_words)
7474
prompt = ''
7575
IMAGE_TOKEN = '<IMAGE_TOKEN>'
7676
for i, msg in enumerate(prompt_messages):

0 commit comments

Comments (0)