Compatible with transformers 5.0 at TurboMind side (#4304)

lvhan028 · web-flow · commit 456aca06e517 · 2026-03-01T13:49:02.000+08:00
* Compatible with transformers 5.0

* no constraint on transformers

* qwen2.5 vl

* fix internvl

* fix internlm

* minor fix qwen2-vl

* improve type hint
diff --git a/lmdeploy/turbomind/deploy/parameter.py b/lmdeploy/turbomind/deploy/parameter.py
@@ -1,6 +1,5 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from abc import abstractmethod
-from typing import List
 
 import torch
 
@@ -23,7 +22,7 @@ def to_fp8(x: torch.Tensor):
 
 
 def pack_u4_row(x: torch.Tensor) -> torch.Tensor:
-    assert x.dtype == torch.uint8
+    assert x.dtype == torch.uint8, f'x.dtype: {x.dtype}'
     xs = x.view(*x.shape[:-1], -1, 8).split(1, dim=-1)
     a = torch.zeros(xs[0].shape, dtype=torch.int32, device=x.device)
     for t in reversed(xs):
@@ -45,7 +44,7 @@ class Parameter:
     KEY = ()
 
     @classmethod
-    def take(cls, keys: List[str]):
+    def take(cls, keys: list[str]):
         if not any(k.endswith(cls.KEYS[0]) for k in keys):
             return False
         xs = []
@@ -126,7 +125,7 @@ def __call__(self, f, g, i):
         f(i, g('Plora_B.weight'), 'lora_b.weight', identity)
 
 
-def get_params(keys: List[str], bias=0):
+def get_params(keys: list[str], bias=0):
     ps = []
     if PLora.take(keys):
         ps.append(PLora())
diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek2.py b/lmdeploy/turbomind/deploy/source_model/deepseek2.py
@@ -146,7 +146,11 @@ def model_info(self):
             info['router_n_groups'] = cfg['router_n_groups']
         rope_param: RopeParam = info['rope_param']
         rope_param.dim = qk_rope_dim
-        rope_scaling = cfg.get('rope_scaling')
+        if 'rope_parameters' in cfg:
+            # transformers v5.0.0 aggregates all rope-related parameters into 'rope_parameters'
+            rope_scaling = cfg['rope_parameters']
+        else:
+            rope_scaling = cfg.get('rope_scaling')
         if rope_scaling and rope_scaling.get('type') == 'yarn':
             attention_factor, yarn_scale = get_yarn_params(rope_scaling)
             yarn_scale *= q_head_dim**(-0.5)
diff --git a/lmdeploy/turbomind/deploy/source_model/internvl.py b/lmdeploy/turbomind/deploy/source_model/internvl.py
@@ -15,10 +15,6 @@ class InternVLReader(LlamaReader):
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.lm_head.weight'
 
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
-        model_cfg = model_cfg.get('llm_config') or model_cfg.get('text_config')
-        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
-
 
 # Note the subtle difference in keys
 class InternVL2Reader(InternLM2Reader):
@@ -30,10 +26,6 @@ class InternVL2Reader(InternLM2Reader):
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.output.weight'
 
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
-        model_cfg = model_cfg.get('llm_config')
-        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
-
 
 class InternVL3d5Reader(Qwen3Reader):
     attn_layer_prefix = 'language_model.model.layers'
@@ -42,10 +34,6 @@ class InternVL3d5Reader(Qwen3Reader):
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.lm_head.weight'
 
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
-        model_cfg = model_cfg.get('llm_config') or model_cfg.get('text_config')
-        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
-
 
 class InternVL3d5Qwen3MoEReader(Qwen3MoeReader):
     attn_layer_prefix = 'language_model.model.layers'
@@ -54,10 +42,6 @@ class InternVL3d5Qwen3MoEReader(Qwen3MoeReader):
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.lm_head.weight'
 
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
-        model_cfg = model_cfg.get('llm_config') or model_cfg.get('text_config')
-        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
-
 
 class InternVL3d5GptOSSReader(GptOssReader):
     attn_layer_prefix = 'language_model.model.layers'
@@ -66,10 +50,6 @@ class InternVL3d5GptOSSReader(GptOssReader):
     norm_weight_key = 'language_model.model.norm.weight'
     output_weight_key = 'language_model.lm_head.weight'
 
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
-        model_cfg = model_cfg.get('llm_config') or model_cfg.get('text_config')
-        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
-
 
 class InternS1Reader(Qwen3MoeReader):
     """InternS1Reader for internlm/InternS1 model."""
@@ -80,12 +60,6 @@ class InternS1Reader(Qwen3MoeReader):
     norm_weight_key = 'model.language_model.norm.weight'
     output_weight_key = 'lm_head.weight'
 
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
-        model_cfg = model_cfg.get('text_config')
-        if model_cfg is None:
-            raise ValueError(f'Miss "text_config" in model config: {model_cfg}')
-        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
-
 
 class InternS1MiniReader(Qwen3Reader):
 
@@ -95,12 +69,6 @@ class InternS1MiniReader(Qwen3Reader):
     norm_weight_key = 'model.language_model.norm.weight'
     output_weight_key = 'lm_head.weight'
 
-    def __init__(self, new_params: dict, unused_params: dict, last_bin: bool, model_cfg: dict, **kwargs):
-        model_cfg = model_cfg.get('text_config')
-        if model_cfg is None:
-            raise ValueError(f'Miss "text_config" in model config: {model_cfg}')
-        super().__init__(new_params, unused_params, last_bin, model_cfg, **kwargs)
-
 
 @INPUT_MODELS.register_module(name='internvl')
 class InternVLModel(LlamaModel):
diff --git a/lmdeploy/turbomind/deploy/source_model/llama.py b/lmdeploy/turbomind/deploy/source_model/llama.py
@@ -132,8 +132,12 @@ class LlamaModel(BaseInputModel):
     def __init__(self, model_path: str, tokenizer_path: str, **kwargs: dict):
         super().__init__(model_path, tokenizer_path)
         self.policy = kwargs.get('input_policy')
-        _, self.model_config = get_model_arch(model_path)
-        self.model_config = self.model_config.to_dict()
+        _, model_config = get_model_arch(model_path)
+        if hasattr(model_config, 'text_config'):
+            model_config = model_config.text_config
+        elif hasattr(model_config, 'llm_config'):
+            model_config = model_config.llm_config
+        self.model_config = model_config.to_dict()
         self.fp8_quant = kwargs.get('fp8_quant', False)
 
     def readers(self):
@@ -171,27 +175,21 @@ def model_info(self):
         max_position_embeddings = int(model_arg.get('max_position_embeddings', 0))
         rope_param = RopeParam(type='default', base=rope_theta, dim=head_dim)
         if isinstance(rope_scaling, dict):
-            llama2_scaling_type = rope_scaling.get('type', '')
-            llama3_scaling_type = rope_scaling.get('rope_type', '')
-            if llama2_scaling_type and llama3_scaling_type \
-                    and llama2_scaling_type != llama3_scaling_type:
-                raise ValueError(f'Ambiguous rope_scaling in config: {model_arg}')
-            scaling_type = llama2_scaling_type if llama2_scaling_type \
-                else llama3_scaling_type
+            rope_type = rope_scaling.get('rope_type', '') or rope_scaling.get('type', '')
             if rope_scaling.get('mrope_section') is not None:
                 # TODO: treat mrope as an option to the common rope functions
-                scaling_type = 'mrope'
+                rope_type = 'mrope'
             scaling_factor = rope_scaling.get('factor', 0.0)
-            if scaling_type == 'default':
+            if rope_type == 'default':
                 pass
-            elif scaling_type == 'dynamic':
+            elif rope_type == 'dynamic':
                 rope_param.type = 'dynamic'
                 rope_param.factor = scaling_factor
                 rope_param.max_position_embeddings = max_position_embeddings
-            elif scaling_type == 'linear':
+            elif rope_type == 'linear':
                 rope_param.type = 'linear'
                 rope_param.factor = scaling_factor
-            elif scaling_type == 'llama3':
+            elif rope_type == 'llama3':
                 low_freq_factor = rope_scaling.get('low_freq_factor', 1.0)
                 high_freq_factor = rope_scaling.get('high_freq_factor', 1.0)
                 original_max_position_embeddings = rope_scaling.get('original_max_position_embeddings', 0)
@@ -200,7 +198,7 @@ def model_info(self):
                 rope_param.low_freq_factor = low_freq_factor
                 rope_param.high_freq_factor = high_freq_factor
                 rope_param.original_max_position_embeddings = original_max_position_embeddings
-            elif scaling_type == 'yarn':
+            elif rope_type == 'yarn':
                 attention_factor = rope_scaling.get('attention_factor', None)
                 if attention_factor is None:
                     attention_factor = 0.1 * math.log(scaling_factor) + 1.0
@@ -217,12 +215,12 @@ def model_info(self):
                 rope_param.attention_factor = attention_factor
                 rope_param.beta_fast = beta_fast
                 rope_param.beta_slow = beta_slow
-            elif scaling_type == 'mrope':
+            elif rope_type == 'mrope':
                 mrope_section = rope_scaling.get('mrope_section')
                 rope_param.type = 'mrope'
                 rope_param.mrope_section = mrope_section
             else:
-                raise RuntimeError(f'Unsupported rope type: {scaling_type}')
+                raise RuntimeError(f'Unsupported rope type: {rope_type}')
 
         return dict(size_per_head=head_dim,
                     num_layer=num_layer,
diff --git a/lmdeploy/vl/model/qwen2.py b/lmdeploy/vl/model/qwen2.py
@@ -1,6 +1,4 @@
 # Copyright (c) OpenMMLab. All rights reserved.
-from typing import Dict, List, Tuple
-
 import torch
 
 from lmdeploy.vl.model.base import VISION_MODELS, VisionModel
@@ -35,7 +33,7 @@ def build_preprocessor(self):
         self.image_token = self.processor.image_token
         self.image_token_id = tokenizer.encode(self.image_token)[-1]
 
-    def preprocess(self, messages: List[Dict]) -> List[Dict]:
+    def preprocess(self, messages: list[dict]) -> list[dict]:
         """Refer to `super().preprocess()` for spec."""
         from qwen_vl_utils import process_vision_info
 
@@ -48,7 +46,7 @@ def preprocess(self, messages: List[Dict]) -> List[Dict]:
             item = dict(type='image', image=image)
             item.update({key: params[key] for key in params.keys() if key in optional_keys})
             image_inputs, _ = process_vision_info([dict(content=[item])])
-            result = self.processor.image_processor(images=image_inputs, videos=None, return_tensors='pt')
+            result = self.processor.image_processor(images=image_inputs, return_tensors='pt')
             merge_length = self.processor.image_processor.merge_size**2
             image_tokens = result['image_grid_thw'].prod(dim=1) // merge_length
             result.update(dict(image_size=image.size, image_tokens=image_tokens, image_token_id=self.image_token_id))
@@ -77,10 +75,7 @@ def build_model(self):
                 if hasattr(config, 'text_config'):
                     config.text_config.tie_word_embeddings = False
                 model = AutoModelCls._from_config(config)
-                if hasattr(AutoModelCls, 'visual'):
-                    # transformers >= 4.52.0 modified model structure
-                    # https://github.com/huggingface/transformers/blob/v4.52.0/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1791-L1800
-                    model.visual = model.model.visual
+                model.visual = model.model.visual
                 del model.model
                 del model.lm_head
                 model.half()
@@ -96,12 +91,12 @@ def build_model(self):
             self.model = model.eval()
 
     @torch.no_grad()
-    def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
+    def forward(self, messages: list[dict], max_batch_size: int = 1) -> list[dict]:
         """Extract image feature. ONLY implement it when the backend is
         turbomind engine.
 
         Args:
-            messages(List[Dict]): the outputs of `preprocess`
+            messages(list[dict]): the outputs of `preprocess`
             max_batch_size(int): the max batch size when forwarding vision
                 model
         Return:
@@ -117,6 +112,10 @@ def forward(self, messages: List[Dict], max_batch_size: int = 1) -> List[Dict]:
             pixel_values = torch.cat(pixel_values, dim=0).to(device)
             image_grid_thw = torch.cat(image_grid_thw, dim=0).to(device)
             image_embeds = self.model.visual(pixel_values, grid_thw=image_grid_thw)
+            if hasattr(image_embeds, 'pooler_output'):
+                # transformers >= 5.0.0: the type of image_embeds is `BaseModelOutputWithPooling`
+                # rather than torch.Tensor
+                image_embeds = image_embeds.pooler_output
             merge_length = self.processor.image_processor.merge_size**2
             split_size = image_grid_thw.prod(dim=1) // merge_length
             image_embeds = image_embeds.split(split_size.tolist())
@@ -162,8 +161,8 @@ def proc_messages(self, messages, chat_template, sequence_start, chat_template_k
 
     @staticmethod
     def get_mrope_info(seq_len: int,
-                       grid_thws: List[Tuple[int, int, int]] = None,
-                       ranges: List[Tuple[int, int]] = None):
+                       grid_thws: list[tuple[int, int, int]] = None,
+                       ranges: list[tuple[int, int]] = None):
         mrope_position_ids = [torch.arange(ranges[0][0]).expand(3, -1)]
         st_idx = ranges[0][0]
         for i, (grid_thw, embedding_range) in enumerate(zip(grid_thws, ranges)):
diff --git a/lmdeploy/vl/model/utils.py b/lmdeploy/vl/model/utils.py
@@ -1,66 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 
 import inspect
-import os
-import sys
 from contextlib import contextmanager
-from typing import Callable, Dict, Iterator, List, MutableSequence, Union
+from typing import Callable, MutableSequence
 
 import torch
-import torch.nn as nn
-from safetensors.torch import load_file
-from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
-from transformers.utils.hub import get_checkpoint_shard_files
-
-
-def load_weight_ckpt(ckpt: str) -> Dict[str, torch.Tensor]:
-    """Load checkpoint."""
-    if ckpt.endswith('.safetensors'):
-        return load_file(ckpt)
-    else:
-        return torch.load(ckpt, weights_only=True)
-
-
-def get_used_weight_files(folder: str, state_dict: Dict[str, torch.Tensor]) -> List[str]:
-    """Get used checkpoint which contains keys in state_dict."""
-    _index_file = os.path.join(folder, WEIGHTS_INDEX_NAME)
-    _safe_index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME)
-    if os.path.exists(_index_file):
-        index_file = _index_file
-    elif os.path.exists(_safe_index_file):
-        index_file = _safe_index_file
-    elif os.path.isfile(os.path.join(folder, SAFE_WEIGHTS_NAME)):  # Single safetensor file
-        return [SAFE_WEIGHTS_NAME]
-    elif os.path.isfile(os.path.join(folder, WEIGHTS_NAME)):
-        return [WEIGHTS_NAME]
-    else:
-        raise FileNotFoundError
-    _, sharded_metadata = get_checkpoint_shard_files(folder, index_file)
-    potential_keys = set(state_dict.keys())
-    supplied_keys = set(sharded_metadata['weight_map'].keys())
-    shared_keys = potential_keys & supplied_keys
-    valid_files = set(sharded_metadata['weight_map'][k] for k in shared_keys)
-    return valid_files
-
-
-def load_model_from_weight_files(model: nn.Module, folder: str) -> None:
-    """Load nn.Module weight from folder."""
-    valid_files = get_used_weight_files(folder, model.state_dict())
-    for file_name in valid_files:
-        ckpt = os.path.join(folder, file_name)
-        state_dict = load_weight_ckpt(ckpt)
-        model.load_state_dict(state_dict, strict=False)
-
-
-@contextmanager
-def add_sys_path(path: Union[str, os.PathLike]) -> Iterator[None]:
-    """Temporarily add the given path to `sys.path`."""
-    path = os.fspath(path)
-    try:
-        sys.path.insert(0, path)
-        yield
-    finally:
-        sys.path.remove(path)
 
 
 @contextmanager
@@ -82,27 +26,7 @@ def disable_logging():
     logging.disable(previous_level)
 
 
-@contextmanager
-def hack_import_with(src: List[str], dst: str = 'torch'):
-    """Replace wanted and uninstalled package with a dummy one.
-
-    Args:
-        src (List): a list of package name
-        dst (str): dummy package name. Default to 'torch'.
-    """
-    import sys
-    from importlib.util import find_spec
-    not_installed = []
-    for item in src:
-        if not find_spec(item):
-            not_installed.append(item)
-            sys.modules[item] = __import__(dst)
-    yield
-    for item in not_installed:
-        sys.modules.pop(item, None)
-
-
-def _set_func(origin_func_path: Union[str, None], rewrite_func: Callable, origin_func: Callable = None):
+def _set_func(origin_func_path: str | None, rewrite_func: Callable, origin_func: Callable = None):
     """Replace old function with the new function.
 
     Args:
@@ -148,7 +72,7 @@ def _set_func(origin_func_path: Union[str, None], rewrite_func: Callable, origin
 
 
 @contextmanager
-def rewrite_ctx(origin_func_path: List[Union[str, Callable]], rewrite_func: List[Callable]):
+def rewrite_ctx(origin_func_path: list[str | Callable], rewrite_func: list[Callable]):
     """Rewrite context."""
     assert len(origin_func_path) == len(rewrite_func)
     origin_func_list = []
diff --git a/requirements/runtime_cuda.txt b/requirements/runtime_cuda.txt
@@ -25,7 +25,7 @@ tiktoken
 tilelang
 torch<=2.10.0,>=2.0.0
 torchvision<=0.25.0,>=0.15.0
-transformers<5.0.0
+transformers>=4.52.0
 triton<=3.6.0,>=3.0.0; sys_platform == "linux" and "aarch64" not in platform_machine and "arm" not in platform_machine
 uvicorn
 xgrammar