Skip to content

Commit f2ab54e

Browse files
authored
add triton-fp8w8a8g128 quant type. (#1214)
1 parent a27dfc8 commit f2ab54e

10 files changed

Lines changed: 664 additions & 60 deletions

File tree

docs/CN/source/tutorial/api_server_args.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,8 @@ PD 分离模式参数
384384
* ``vllm-fp8w8a8-b128``
385385
* ``deepgemm-fp8w8a8-b128``
386386
* ``triton-fp8w8a8-block128``
387+
* ``triton-fp8w8a8g128``: 权重 per-channel 量化和激活 per-group 128 量化
388+
* ``triton-fp8w8a8g64``: 权重 per-channel 量化和激活 per-group 64 量化
387389
* ``awq``
388390
* ``awq_marlin``
389391
* ``none`` (默认)

docs/EN/source/tutorial/api_server_args.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,8 @@ Quantization Parameters
376376
* ``vllm-fp8w8a8-b128``
377377
* ``deepgemm-fp8w8a8-b128``
378378
* ``triton-fp8w8a8-block128``
379+
* ``triton-fp8w8a8g128``: weight per-channel quant and activation per-group 128 quant
380+
* ``triton-fp8w8a8g64``: weight per-channel quant and activation per-group 64 quant
379381
* ``awq``
380382
* ``awq_marlin``
381383
* ``none`` (default)

lightllm/common/basemodel/triton_kernel/quantization/fp8act_quant_kernel.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ def _per_token_group_quant_fp8(
2929
xs_n,
3030
xs_row_major: tl.constexpr,
3131
BLOCK: tl.constexpr,
32+
NEED_MASK: tl.constexpr,
3233
):
3334
g_id = tl.program_id(0)
3435
y_ptr += g_id * y_stride
@@ -41,9 +42,15 @@ def _per_token_group_quant_fp8(
4142
y_s_ptr += col_id * xs_m + row_id # col major
4243

4344
cols = tl.arange(0, BLOCK) # N <= BLOCK
44-
mask = cols < N
4545

46-
y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
46+
if NEED_MASK:
47+
mask = cols < N
48+
other = 0.0
49+
else:
50+
mask = None
51+
other = None
52+
53+
y = tl.load(y_ptr + cols, mask=mask, other=other).to(tl.float32)
4754
# Quant
4855
_absmax = tl.maximum(tl.max(tl.abs(y)), eps)
4956
y_s = _absmax / fp8_max
@@ -99,6 +106,7 @@ def lightllm_per_token_group_quant_fp8(
99106
xs_n=xs_n,
100107
xs_row_major=xs_row_major,
101108
BLOCK=BLOCK,
109+
NEED_MASK=BLOCK != group_size,
102110
num_warps=num_warps,
103111
num_stages=num_stages,
104112
)
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
import torch
2+
import triton
3+
import triton.language as tl
4+
from lightllm.utils.dist_utils import get_current_device_id
5+
6+
7+
@triton.jit
def weight_quant_kernel(x_ptr, s_ptr, y_ptr, M, N, BLOCK_N: tl.constexpr):
    """Per-row (per-channel) FP8 E4M3 quantization kernel.

    Each program quantizes one row of the (M, N) input ``x_ptr``: it
    computes the row's absolute maximum, derives a float32 scale
    ``absmax / 448.0`` (448 is the max finite fp8e4m3 value), writes the
    quantized row to ``y_ptr`` and the per-row scale to ``s_ptr``.

    ``M`` is unused inside the kernel (the launch grid already has one
    program per row); it is kept for interface stability.
    """
    m_index = tl.program_id(axis=0)

    offs_n = tl.arange(0, BLOCK_N)
    mask = offs_n < N

    x = tl.load(x_ptr + m_index * N + offs_n, mask=mask, other=0.0).to(tl.float32)

    amax = tl.max(tl.abs(x))

    max_fp8e4m3_val = 448.0
    # Clamp absmax so an all-zero row cannot produce scale == 0, and divide
    # by the *same* scale that is stored.  The original code divided by
    # (scale + 1e-6) but stored plain `scale`, so dequantization
    # (y * scale) did not exactly invert quantization.
    scale = tl.maximum(amax, 1e-6) / max_fp8e4m3_val
    y = (x / scale).to(y_ptr.dtype.element_ty)

    tl.store(y_ptr + m_index * N + offs_n, y, mask=mask)
    tl.store(s_ptr + m_index, scale)
24+
25+
26+
def mm_weight_quant(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize a 2-D tensor to fp8e4m3 with one float32 scale per row.

    Returns ``(quantized, scales)`` where ``quantized`` has the same
    (M, N) shape and device as ``x`` and ``scales`` has shape (M, 1).
    """
    assert x.is_contiguous(), "Input tensor must be contiguous"
    num_rows, num_cols = x.shape

    quantized = torch.empty_like(x, dtype=torch.float8_e4m3fn)
    row_scales = torch.empty((num_rows, 1), dtype=torch.float32, device=x.device)

    # One program per row; the block must cover a full row.
    block_n = triton.next_power_of_2(num_cols)
    weight_quant_kernel[(num_rows,)](
        x, row_scales, quantized, num_rows, num_cols, BLOCK_N=block_n, num_warps=16
    )
    return quantized, row_scales
36+
37+
38+
def weight_quant(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    """Per-channel fp8e4m3 quantization of a 2-D or 3-D weight tensor.

    The tensor is moved to the current CUDA device first.  For a 2-D
    (M, N) input this returns (M, N) fp8 weights and (M, 1) float32
    scales; for a 3-D (E, M, N) input (e.g. stacked expert weights) it
    returns (E, M, N) fp8 weights and (E, M, 1) scales.

    Raises:
        AssertionError: if ``x`` is not contiguous or not 2-D/3-D.
    """
    assert x.is_contiguous(), "Input tensor must be contiguous"
    assert x.dim() in (2, 3), f"expected a 2-D or 3-D tensor, got {x.dim()}-D"
    x = x.cuda(get_current_device_id())
    if x.dim() == 3:
        # Flatten the leading dim into the row axis so one kernel launch
        # covers all matrices, instead of one launch per matrix as before.
        # Valid because x is contiguous, and per-row scales are unaffected
        # by how rows are grouped.
        e, m, n = x.shape
        y_flat, s_flat = mm_weight_quant(x.view(e * m, n))
        return y_flat.view(e, m, n), s_flat.view(e, m, 1)
    return mm_weight_quant(x)

0 commit comments

Comments
 (0)