From 6c473b4f90fc40f52ab7a786e2c6dfc5175fc013 Mon Sep 17 00:00:00 2001
From: irexyc <irexyc@gmail.com>
Date: Mon, 8 Jun 2026 07:44:56 +0000
Subject: [PATCH 1/6] fix dequant_mixed

---
 lmdeploy/turbomind/converter.py     | 2 ++
 lmdeploy/turbomind/weight_format.py | 7 ++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/lmdeploy/turbomind/converter.py b/lmdeploy/turbomind/converter.py
index e8e592c512..6936b80af2 100644
--- a/lmdeploy/turbomind/converter.py
+++ b/lmdeploy/turbomind/converter.py
@@ -133,6 +133,8 @@ def _resolve_dtype(requested: str, hf_model_cfg) -> str:
     has_bf16 = is_bf16_supported()
     dtype = requested
     if dtype == 'auto':
+        if getattr(hf_model_cfg, 'text_config', None):
+            hf_model_cfg = hf_model_cfg.text_config
         dtype = 'bfloat16' if has_bf16 else 'float16'
         torch_dtype = getattr(hf_model_cfg, 'dtype', None)
         if torch_dtype is None:
diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py
index e81a39fe2a..39f4e85e44 100644
--- a/lmdeploy/turbomind/weight_format.py
+++ b/lmdeploy/turbomind/weight_format.py
@@ -240,7 +240,12 @@ def dequant(self, tensors, data_type):
         scales  = tensors['scales']
         qzeros  = tensors['zeros']
         group_size = qweight.shape[0] // scales.shape[0]
-        w = dequantize_gemm(qweight, qzeros, scales, 4, group_size)
+        if qweight.dtype == torch.int32 and qzeros.dtype == torch.int32:
+            w = dequantize_gemm(qweight, qzeros, scales, 4, group_size)
+        else:
+            w = qweight.unflatten(0, (-1, group_size))
+            w = (w - qzeros[:, None]) * scales[:, None]
+            w = w.flatten(0, 1)
         result: dict[str, Tensor] = {'weight': w}
         if 'bias' in tensors:
             result['bias'] = tensors['bias']

From b88e0aacea0280d02fdd0b3c57e26291d1514cf8 Mon Sep 17 00:00:00 2001
From: irexyc <irexyc@gmail.com>
Date: Tue, 9 Jun 2026 12:42:53 +0000
Subject: [PATCH 2/6] add llm_config check

---
 lmdeploy/turbomind/converter.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lmdeploy/turbomind/converter.py b/lmdeploy/turbomind/converter.py
index 6936b80af2..7007f84570 100644
--- a/lmdeploy/turbomind/converter.py
+++ b/lmdeploy/turbomind/converter.py
@@ -135,6 +135,8 @@ def _resolve_dtype(requested: str, hf_model_cfg) -> str:
     if dtype == 'auto':
         if getattr(hf_model_cfg, 'text_config', None):
             hf_model_cfg = hf_model_cfg.text_config
+        elif getattr(hf_model_cfg, 'llm_config', None):
+            hf_model_cfg = hf_model_cfg.llm_config
         dtype = 'bfloat16' if has_bf16 else 'float16'
         torch_dtype = getattr(hf_model_cfg, 'dtype', None)
         if torch_dtype is None:

From 554d203a9811b04db133059ce5944354c650d5f9 Mon Sep 17 00:00:00 2001
From: irexyc <irexyc@gmail.com>
Date: Thu, 11 Jun 2026 05:51:40 +0000
Subject: [PATCH 3/6] remove dequantize_gemm

---
 lmdeploy/turbomind/weight_format.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py
index 39f4e85e44..1a370c36ee 100644
--- a/lmdeploy/turbomind/weight_format.py
+++ b/lmdeploy/turbomind/weight_format.py
@@ -234,18 +234,13 @@ def pack(self, tensor: Tensor, kind: str) -> PackedTensor:
         return PackedTensor(tensor, None, None)
 
     def dequant(self, tensors, data_type):
-        from lmdeploy.pytorch.backends.default.awq_modules import dequantize_gemm
-
         qweight = tensors['weight']
         scales  = tensors['scales']
         qzeros  = tensors['zeros']
         group_size = qweight.shape[0] // scales.shape[0]
-        if qweight.dtype == torch.int32 and qzeros.dtype == torch.int32:
-            w = dequantize_gemm(qweight, qzeros, scales, 4, group_size)
-        else:
-            w = qweight.unflatten(0, (-1, group_size))
-            w = (w - qzeros[:, None]) * scales[:, None]
-            w = w.flatten(0, 1)
+        w = qweight.unflatten(0, (-1, group_size))
+        w = (w - qzeros[:, None]) * scales[:, None]
+        w = w.flatten(0, 1)
         result: dict[str, Tensor] = {'weight': w}
         if 'bias' in tensors:
             result['bias'] = tensors['bias']

From 0e098613f4d045d7cec537fc453a90dbf243854e Mon Sep 17 00:00:00 2001
From: irexyc <irexyc@gmail.com>
Date: Thu, 11 Jun 2026 13:00:12 +0000
Subject: [PATCH 4/6] add CompressedTensorFormat.dequant

---
 lmdeploy/turbomind/linear.py        |  4 ++--
 lmdeploy/turbomind/weight_format.py | 23 ++++++++++++++++++++---
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/lmdeploy/turbomind/linear.py b/lmdeploy/turbomind/linear.py
index edbdc94fe1..fc436c8cef 100644
--- a/lmdeploy/turbomind/linear.py
+++ b/lmdeploy/turbomind/linear.py
@@ -73,8 +73,8 @@ def _dequant_linear(linear: Linear, *, data_type) -> Linear:
     """Dequantize a quantized Linear to trivial.
 
     ``TrivialFormat.dequant`` is identity, so already-trivial inputs round-trip
-    safely.  ``AWQFormat.dequant`` and ``FP8Format.dequant`` do real work.
-    GPTQ / CompressedTensor / MXFP4 inherit the base-class
+    safely.  ``AWQFormat.dequant``, ``CompressedTensorFormat.dequant`` and
+    ``FP8Format.dequant`` do real work. GPTQ / MXFP4 inherit the base-class
     ``NotImplementedError`` — calling ``_dequant_linear`` on one of those is a
     broken-fusion-group configuration, and the raise names it at the call site.
     """
diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py
index 1a370c36ee..afd79d1a17 100644
--- a/lmdeploy/turbomind/weight_format.py
+++ b/lmdeploy/turbomind/weight_format.py
@@ -75,9 +75,9 @@ def pack_u4_row(x: torch.Tensor) -> torch.Tensor:
 
 
 def _zeros_int4_symmetric(scales: Tensor) -> Tensor:
-    """Synthesize symmetric int4 zero-points (value = 8) matching *scales*
-    shape."""
-    return torch.full(scales.shape, 8, dtype=torch.uint8, device=scales.device)
+    """Synthesize normalized symmetric int4 zero-points (value = 8) matching
+    *scales* shape."""
+    return torch.full(scales.shape, 8, dtype=scales.dtype, device=scales.device)
 
 
 # ---------------------------------------------------------------------------
@@ -329,6 +329,23 @@ def pack(self, tensor: Tensor, kind: str) -> PackedTensor:
     def synthesize_zeros(self, scales: Tensor) -> Tensor:
         return _zeros_int4_symmetric(scales)
 
+    def dequant(self, tensors, data_type):
+        weight = tensors['weight']
+        scales = tensors['scales']
+        zeros  = tensors['zeros']
+
+        out_size = weight.shape[-1]
+        zeros  = zeros[..., :out_size]
+
+        scales = scales.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]]
+        zeros  = zeros.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]]
+
+        w = (weight.to(scales.dtype) - zeros.to(scales.dtype)) * scales
+        result: dict[str, Tensor] = {'weight': w}
+        if 'bias' in tensors:
+            result['bias'] = tensors['bias']
+        return result
+
 
 class FP8Format(WeightFormat):
     name           = 'fp8'

From 0467a02b46e998b51971188b8ce596a5471d2158 Mon Sep 17 00:00:00 2001
From: Chen Xin <irexyc@gmail.com>
Date: Thu, 11 Jun 2026 21:13:51 +0800
Subject: [PATCH 5/6] tidy spacing of zeros expansion in CompressedTensorFormat.dequant

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 lmdeploy/turbomind/weight_format.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py
index afd79d1a17..259997aad8 100644
--- a/lmdeploy/turbomind/weight_format.py
+++ b/lmdeploy/turbomind/weight_format.py
@@ -338,8 +338,7 @@ def dequant(self, tensors, data_type):
         zeros  = zeros[..., :out_size]
 
         scales = scales.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]]
-        zeros  = zeros.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]]
-
+        zeros = zeros.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]]
         w = (weight.to(scales.dtype) - zeros.to(scales.dtype)) * scales
         result: dict[str, Tensor] = {'weight': w}
         if 'bias' in tensors:

From 7b692cb0102c8d5d486f905611b1c8743a35b2fc Mon Sep 17 00:00:00 2001
From: irexyc <irexyc@gmail.com>
Date: Thu, 11 Jun 2026 13:15:16 +0000
Subject: [PATCH 6/6] drop alignment padding on zeros assignments in CompressedTensorFormat.dequant

---
 lmdeploy/turbomind/weight_format.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lmdeploy/turbomind/weight_format.py b/lmdeploy/turbomind/weight_format.py
index 259997aad8..27eb35d586 100644
--- a/lmdeploy/turbomind/weight_format.py
+++ b/lmdeploy/turbomind/weight_format.py
@@ -332,10 +332,10 @@ def synthesize_zeros(self, scales: Tensor) -> Tensor:
     def dequant(self, tensors, data_type):
         weight = tensors['weight']
         scales = tensors['scales']
-        zeros  = tensors['zeros']
+        zeros = tensors['zeros']
 
         out_size = weight.shape[-1]
-        zeros  = zeros[..., :out_size]
+        zeros = zeros[..., :out_size]
 
         scales = scales.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]]
         zeros = zeros.repeat_interleave(self.block_in, dim=0)[:weight.shape[0]]