
Commit f1e1a05

support qwen3.5 on volta (#4405)
* support qwen3.5 on volta
* fix copilot comment
* fix float32
* update kernel
1 parent 693082c commit f1e1a05

6 files changed

Lines changed: 230 additions & 115 deletions
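
The diffs below share one idea: Volta GPUs (compute capability 7.0) have no native bfloat16 support, so the engine now threads `device_type` through the config pipeline, falls back to float16 where bf16 is unavailable, and adds fused-MoE Triton autotune configs usable on SM7x devices. As a rough illustration of the capability gate (a sketch built on public torch APIs, not the body of lmdeploy's `is_bf16_supported`):

```python
# Illustrative only: a CUDA-oriented bf16 gate. lmdeploy's is_bf16_supported
# also takes a device_type argument and covers non-CUDA backends.
import torch


def bf16_supported_on_cuda() -> bool:
    if not torch.cuda.is_available():
        return False
    # Volta reports (7, 0); bf16 tensor cores arrive with Ampere (8, 0).
    return torch.cuda.is_bf16_supported()
```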


lmdeploy/pytorch/check_env/model.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -52,7 +52,10 @@ def check_dtype(self, config):
 
             from lmdeploy.pytorch.config import ModelConfig
             from lmdeploy.utils import is_bf16_supported
-            model_config = ModelConfig.from_hf_config(config, model_path=model_path, dtype=dtype)
+            model_config = ModelConfig.from_hf_config(config,
+                                                      model_path=model_path,
+                                                      dtype=dtype,
+                                                      device_type=device_type)
             if model_config.dtype == torch.bfloat16:
                 if not is_bf16_supported(device_type):
                     logger.warning('Device does not support bfloat16.')
```
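
With `device_type` threaded into `ModelConfig.from_hf_config`, the environment check resolves the dtype against the actual target backend. A hedged usage sketch; the model id below is a placeholder, not a real checkpoint:

```python
# Hypothetical call site after this change; 'Qwen/Qwen3.5-example' is a
# placeholder model id.
from transformers import AutoConfig

from lmdeploy.pytorch.config import ModelConfig

hf_config = AutoConfig.from_pretrained('Qwen/Qwen3.5-example', trust_remote_code=True)
model_config = ModelConfig.from_hf_config(hf_config,
                                          model_path='Qwen/Qwen3.5-example',
                                          dtype='auto',
                                          device_type='cuda')
# On Volta this now resolves to torch.float16 instead of warning about bf16.
print(model_config.dtype)
```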

lmdeploy/pytorch/config.py

Lines changed: 9 additions & 3 deletions
```diff
@@ -8,18 +8,19 @@
 from lmdeploy.messages import PytorchEngineConfig
 from lmdeploy.pytorch.disagg.config import EngineRole, MigrationBackend
 from lmdeploy.pytorch.utils import maybe_register_config_serialize_by_value
-from lmdeploy.utils import get_logger
+from lmdeploy.utils import get_logger, is_bf16_supported
 
 logger = get_logger('lmdeploy')
 
 
-def _update_torch_dtype(config: 'ModelConfig', dtype: str):
+def _update_torch_dtype(config: 'ModelConfig', dtype: str, device_type: str = 'auto'):
     """Update the torch dtype from the model config.
 
     Args:
         config (ModelConfig): The input model config.
         dtype (str): user specified data type. Refer to
             `PyTorchEngineConfig.dtype` for detailed info
+        device_type (str): The device type. Refer to `PyTorchEngineConfig.device_type` for detailed info
     """
     quantization_config = getattr(config.hf_config, 'quantization_config', dict())
     quant_method = quantization_config.get('quant_method', None)
@@ -48,6 +49,8 @@ def _update_torch_dtype(config: 'ModelConfig', dtype: str):
         # update hf_config as well
         setattr(config.hf_config, 'torch_dtype', torch_dtype)
     else:
+        if torch_dtype == 'bfloat16' and not is_bf16_supported(device_type):
+            torch_dtype = 'float16'
         # change to user specified data type if it is not 'auto'
         if dtype == 'auto':
             torch_dtype = torch_dtype if torch_dtype in ['float16', 'bfloat16'] else 'float16'
@@ -356,6 +359,7 @@
         is_draft_model: bool = False,
         spec_method: str = None,
         model_format: str = None,
+        device_type: str = 'auto',
     ):
         """Instantiate one of the configuration classes of the library from a
         pretrained model configuration.
@@ -386,6 +390,7 @@
             dist_config=dist_config,
             is_draft_model=is_draft_model,
             spec_method=spec_method,
+            device_type=device_type,
         )
         fp32_lm_head = False
         if hf_overrides is not None:
@@ -413,6 +418,7 @@
         dist_config: DistConfig = None,
         is_draft_model: bool = False,
         spec_method: str = None,
+        device_type: str = 'auto',
     ):
         """From huggingface config."""
         from lmdeploy.pytorch.configurations import AutoModelConfigBuilder
@@ -441,7 +447,7 @@ def from_hf_config(
         assert tp % model_config.num_key_value_heads == 0
 
         # should after setting `hf_config` and `model_arch` attributes
-        model_config = _update_torch_dtype(model_config, dtype)
+        model_config = _update_torch_dtype(model_config, dtype, device_type=device_type)
 
         # update eos_token_id to list
         if isinstance(model_config.eos_token_id, int):
```
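
The new branch in `_update_torch_dtype` downgrades a bfloat16 checkpoint to float16 before the user-specified dtype is resolved; an explicit user `dtype='bfloat16'` still wins (and then triggers the warning from the environment check above). A self-contained sketch of that resolution order, with illustrative names rather than lmdeploy's API:

```python
def resolve_dtype(checkpoint_dtype: str, user_dtype: str, bf16_ok: bool) -> str:
    """Sketch of the non-quantized path in _update_torch_dtype."""
    if checkpoint_dtype == 'bfloat16' and not bf16_ok:
        checkpoint_dtype = 'float16'  # the new device-aware fallback
    if user_dtype == 'auto':
        return checkpoint_dtype if checkpoint_dtype in ('float16', 'bfloat16') else 'float16'
    return user_dtype  # explicit user choice still wins


assert resolve_dtype('bfloat16', 'auto', bf16_ok=False) == 'float16'   # Volta
assert resolve_dtype('bfloat16', 'auto', bf16_ok=True) == 'bfloat16'   # Ampere+
```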

lmdeploy/pytorch/configurations/qwen3_5.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -1,6 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import torch
 
+from lmdeploy.utils import is_bf16_supported
+
 from .builder import AutoModelConfigBuilder
 from .default import DefaultModelConfigBuilder
 from .qwen3_next import _check_env_qwen3_next
@@ -42,7 +44,10 @@ def build(cls, hf_config, model_path: str = None, tp: int = 1, **kwargs):
 
         conv_state_shape = (num_delta_layers, conv_dim, conv_kernel_size)
         recurrent_state_shape = (num_delta_layers, num_v_heads, head_k_dim, head_v_dim)
-        dtype = torch.bfloat16
+        if is_bf16_supported():
+            dtype = torch.bfloat16
+        else:
+            dtype = torch.float16
         cfg.states_shapes = [(conv_state_shape, dtype), (recurrent_state_shape, dtype)]
         cfg.check_env_func = _check_env_qwen3_next
         return cfg
```
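
The builder now picks the dtype of the delta-layer state caches at build time instead of hard-coding bfloat16, so the cache dtype matches what the device can run. An illustrative construction; the dimension values are placeholders, not Qwen3.5's real sizes:

```python
import torch

# Placeholder dimensions, for illustration only.
num_delta_layers, conv_dim, conv_kernel_size = 24, 4096, 4
num_v_heads, head_k_dim, head_v_dim = 16, 128, 128

dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
conv_state_shape = (num_delta_layers, conv_dim, conv_kernel_size)
recurrent_state_shape = (num_delta_layers, num_v_heads, head_k_dim, head_v_dim)
states_shapes = [(conv_state_shape, dtype), (recurrent_state_shape, dtype)]
```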

lmdeploy/pytorch/engine/executor/__init__.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -79,6 +79,7 @@ def build_executor(
         is_draft_model=False,
         spec_method=None if specdecode_config is None else specdecode_config.method,
         model_format=misc_config.model_format,
+        device_type=device_type,
     )
 
     if distributed_executor_backend is None:
```

lmdeploy/pytorch/kernels/cuda/fused_moe.py

Lines changed: 31 additions & 2 deletions
```diff
@@ -27,6 +27,7 @@ def get_cuda_autotune_config():
         },
                       num_stages=4,
                       num_warps=4),
+        # SM8
         triton.Config({
             'BLOCK_SIZE_M': 128,
             'BLOCK_SIZE_N': 128,
@@ -51,18 +52,46 @@ def get_cuda_autotune_config():
         },
                       num_stages=4,
                       num_warps=4),
+        # SM7-
+        triton.Config({
+            'BLOCK_SIZE_M': 64,
+            'BLOCK_SIZE_N': 128,
+            'BLOCK_SIZE_K': 32,
+            'GROUP_SIZE_M': 1,
+        },
+                      num_stages=4,
+                      num_warps=4),
+        triton.Config({
+            'BLOCK_SIZE_M': 128,
+            'BLOCK_SIZE_N': 32,
+            'BLOCK_SIZE_K': 32,
+            'GROUP_SIZE_M': 1,
+        },
+                      num_stages=4,
+                      num_warps=4),
+        triton.Config({
+            'BLOCK_SIZE_M': 64,
+            'BLOCK_SIZE_N': 32,
+            'BLOCK_SIZE_K': 32,
+            'GROUP_SIZE_M': 1,
+        },
+                      num_stages=5,
+                      num_warps=2),
     ]
 
 
-def _config_prune_func(config: dict, *args, **kwargs):
+def _config_prune_func(config: list, *args, **kwargs):
     """Fused moe config prune."""
     device_cap = torch.cuda.get_device_capability()
     num_sm9x = 2
+    cum_num_sm8x = 5
 
     if device_cap[0] >= 9:
         return config[:num_sm9x]
+    elif device_cap[0] >= 8:
+        return config[num_sm9x:cum_num_sm8x]
     else:
-        return config[num_sm9x:]
+        return config[cum_num_sm8x:]
 
 
 @triton.autotune(
```
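
After this change the autotune list is laid out in three bands: two SM9x configs, three SM8x configs, then the SM7- configs appended here, and `_config_prune_func` slices out the band matching the device's major compute capability (which also explains the `dict` to `list` annotation fix). A tiny sketch of the banding, with string stand-ins for the `triton.Config` objects:

```python
def prune(configs: list, major: int) -> list:
    """Sketch of _config_prune_func's banding logic."""
    num_sm9x = 2      # configs[0:2] -> Hopper and newer (SM90+)
    cum_num_sm8x = 5  # configs[2:5] -> Ampere/Ada (SM80/SM89)
    if major >= 9:
        return configs[:num_sm9x]
    elif major >= 8:
        return configs[num_sm9x:cum_num_sm8x]
    return configs[cum_num_sm8x:]  # configs[5:] -> Volta and older (SM7-)


bands = ['sm9_a', 'sm9_b', 'sm8_a', 'sm8_b', 'sm8_c', 'sm7_a', 'sm7_b', 'sm7_c']
assert prune(bands, 7) == ['sm7_a', 'sm7_b', 'sm7_c']  # e.g. V100 (Volta)
assert prune(bands, 9) == ['sm9_a', 'sm9_b']           # e.g. H100 (Hopper)
```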
