From 6c473b4f90fc40f52ab7a786e2c6dfc5175fc013 Mon Sep 17 00:00:00 2001 From: irexyc Date: Mon, 8 Jun 2026 07:44:56 +0000 Subject: [PATCH 1/6] fix dequant_mixed --- lmdeploy/turbomind/converter.py | 2 ++ lmdeploy/turbomind/weight_format.py | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/lmdeploy/turbomind/converter.py b/lmdeploy/turbomind/converter.py index e8e592c512..6936b80af2 100644 --- a/lmdeploy/turbomind/converter.py +++ b/lmdeploy/turbomind/converter.py @@ -133,6 +133,8 @@ def _resolve_dtype(requested: str, hf_model_cfg) -> str: has_bf16 = is_bf16_supported() dtype = requested if dtype == 'auto': + if getattr(hf_model_cfg, 'text_config', None): + hf_model_cfg = hf_model_cfg.text_config dtype = 'bfloat16' if has_bf16 else 'float16' torch_dtype = getattr(hf_model_cfg, 'dtype', None) if torch_dtype is None: diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py index e81a39fe2a..39f4e85e44 100644 --- a/lmdeploy/turbomind/weight_format.py +++ b/lmdeploy/turbomind/weight_format.py @@ -240,7 +240,12 @@ def dequant(self, tensors, data_type): scales = tensors['scales'] qzeros = tensors['zeros'] group_size = qweight.shape[0] // scales.shape[0] - w = dequantize_gemm(qweight, qzeros, scales, 4, group_size) + if qweight.dtype == torch.int32 and qzeros.dtype == torch.int32: + w = dequantize_gemm(qweight, qzeros, scales, 4, group_size) + else: + w = qweight.unflatten(0, (-1, group_size)) + w = (w - qzeros[:, None]) * scales[:, None] + w = w.flatten(0, 1) result: dict[str, Tensor] = {'weight': w} if 'bias' in tensors: result['bias'] = tensors['bias'] From b88e0aacea0280d02fdd0b3c57e26291d1514cf8 Mon Sep 17 00:00:00 2001 From: irexyc Date: Tue, 9 Jun 2026 12:42:53 +0000 Subject: [PATCH 2/6] add llm_config check --- lmdeploy/turbomind/converter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lmdeploy/turbomind/converter.py b/lmdeploy/turbomind/converter.py index 6936b80af2..7007f84570 100644 --- a/lmdeploy/turbomind/converter.py +++ b/lmdeploy/turbomind/converter.py @@ -135,6 +135,8 @@ def _resolve_dtype(requested: str, hf_model_cfg) -> str: if dtype == 'auto': if getattr(hf_model_cfg, 'text_config', None): hf_model_cfg = hf_model_cfg.text_config + elif getattr(hf_model_cfg, 'llm_config', None): + hf_model_cfg = hf_model_cfg.llm_config dtype = 'bfloat16' if has_bf16 else 'float16' torch_dtype = getattr(hf_model_cfg, 'dtype', None) if torch_dtype is None: From 554d203a9811b04db133059ce5944354c650d5f9 Mon Sep 17 00:00:00 2001 From: irexyc Date: Thu, 11 Jun 2026 05:51:40 +0000 Subject: [PATCH 3/6] remove dequantize_gemm --- lmdeploy/turbomind/weight_format.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py index 39f4e85e44..1a370c36ee 100644 --- a/lmdeploy/turbomind/weight_format.py +++ b/lmdeploy/turbomind/weight_format.py @@ -234,18 +234,13 @@ def pack(self, tensor: Tensor, kind: str) -> PackedTensor: return PackedTensor(tensor, None, None) def dequant(self, tensors, data_type): - from lmdeploy.pytorch.backends.default.awq_modules import dequantize_gemm - qweight = tensors['weight'] scales = tensors['scales'] qzeros = tensors['zeros'] group_size = qweight.shape[0] // scales.shape[0] - if qweight.dtype == torch.int32 and qzeros.dtype == torch.int32: - w = dequantize_gemm(qweight, qzeros, scales, 4, group_size) - else: - w = qweight.unflatten(0, (-1, group_size)) - w = (w - qzeros[:, None]) * scales[:, None] - w = w.flatten(0, 1) + w = qweight.unflatten(0, (-1, group_size)) + w = (w - qzeros[:, None]) * scales[:, None] + w = w.flatten(0, 1) result: dict[str, Tensor] = {'weight': w} if 'bias' in tensors: result['bias'] = tensors['bias'] From 0e098613f4d045d7cec537fc453a90dbf243854e Mon Sep 17 00:00:00 2001 From: irexyc Date: Thu, 11 Jun 2026 13:00:12 +0000 Subject: [PATCH 4/6] add CompressedTensorFormat.dequant --- lmdeploy/turbomind/linear.py | 4 ++-- lmdeploy/turbomind/weight_format.py | 23 ++++++++++++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/lmdeploy/turbomind/linear.py b/lmdeploy/turbomind/linear.py index edbdc94fe1..fc436c8cef 100644 --- a/lmdeploy/turbomind/linear.py +++ b/lmdeploy/turbomind/linear.py @@ -73,8 +73,8 @@ def _dequant_linear(linear: Linear, *, data_type) -> Linear: """Dequantize a quantized Linear to trivial. ``TrivialFormat.dequant`` is identity, so already-trivial inputs round-trip - safely. ``AWQFormat.dequant`` and ``FP8Format.dequant`` do real work. - GPTQ / CompressedTensor / MXFP4 inherit the base-class + safely. ``AWQFormat.dequant``, ``CompressedTensorFormat.dequant`` and + ``FP8Format.dequant`` do real work. GPTQ / MXFP4 inherit the base-class ``NotImplementedError`` — calling ``_dequant_linear`` on one of those is a broken-fusion-group configuration, and the raise names it at the call site. """ diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py index 1a370c36ee..afd79d1a17 100644 --- a/lmdeploy/turbomind/weight_format.py +++ b/lmdeploy/turbomind/weight_format.py @@ -75,9 +75,9 @@ def pack_u4_row(x: torch.Tensor) -> torch.Tensor: def _zeros_int4_symmetric(scales: Tensor) -> Tensor: - """Synthesize symmetric int4 zero-points (value = 8) matching *scales* - shape.""" - return torch.full(scales.shape, 8, dtype=torch.uint8, device=scales.device) + """Synthesize normalized symmetric int4 zero-points (value = 8) matching + *scales* shape.""" + return torch.full(scales.shape, 8, dtype=scales.dtype, device=scales.device) # --------------------------------------------------------------------------- @@ -329,6 +329,23 @@ def pack(self, tensor: Tensor, kind: str) -> PackedTensor: def synthesize_zeros(self, scales: Tensor) -> Tensor: return _zeros_int4_symmetric(scales) + def dequant(self, tensors, data_type): + weight = tensors['weight'] + scales = tensors['scales'] + zeros = tensors['zeros'] + + out_size = weight.shape[-1] + zeros = zeros[..., :out_size] + + scales = scales.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]] + zeros = zeros.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]] + + w = (weight.to(scales.dtype) - zeros.to(scales.dtype)) * scales + result: dict[str, Tensor] = {'weight': w} + if 'bias' in tensors: + result['bias'] = tensors['bias'] + return result + class FP8Format(WeightFormat): name = 'fp8' From 0467a02b46e998b51971188b8ce596a5471d2158 Mon Sep 17 00:00:00 2001 From: Chen Xin Date: Thu, 11 Jun 2026 21:13:51 +0800 Subject: [PATCH 5/6] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- lmdeploy/turbomind/weight_format.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py index afd79d1a17..259997aad8 100644 --- a/lmdeploy/turbomind/weight_format.py +++ b/lmdeploy/turbomind/weight_format.py @@ -338,8 +338,7 @@ def dequant(self, tensors, data_type): zeros = zeros[..., :out_size] scales = scales.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]] - zeros = zeros.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]] - + zeros = zeros.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]] w = (weight.to(scales.dtype) - zeros.to(scales.dtype)) * scales result: dict[str, Tensor] = {'weight': w} if 'bias' in tensors: From 7b692cb0102c8d5d486f905611b1c8743a35b2fc Mon Sep 17 00:00:00 2001 From: irexyc Date: Thu, 11 Jun 2026 13:15:16 +0000 Subject: [PATCH 6/6] Potential fix for pull request finding --- lmdeploy/turbomind/weight_format.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py index 259997aad8..27eb35d586 100644 --- a/lmdeploy/turbomind/weight_format.py +++ b/lmdeploy/turbomind/weight_format.py @@ -332,10 +332,10 @@ def synthesize_zeros(self, scales: Tensor) -> Tensor: def dequant(self, tensors, data_type): weight = tensors['weight'] scales = tensors['scales'] - zeros = tensors['zeros'] + zeros = tensors['zeros'] out_size = weight.shape[-1] - zeros = zeros[..., :out_size] + zeros = zeros[..., :out_size] scales = scales.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]] zeros = zeros.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]]