add diffusion fp8-per-token-sgl (#169)

StromNoNo · web-flow · commit 7fa71bc2fd4d · 2025-12-11T19:23:48.000+08:00
diff --git a/angelslim/compressor/diffusion/quant/modules/linear.py b/angelslim/compressor/diffusion/quant/modules/linear.py
@@ -19,6 +19,7 @@
     fp8_per_block_quant,
     fp8_per_tensor_quant,
     fp8_per_token_group_quant,
+    fp8_per_token_quant_sgl,
     fp8_weight_only_gemm,
 )
 
@@ -61,6 +62,10 @@ def forward(self, x):
             origin_shape = None
             x_2d = x.view(-1, x.shape[-1])
             qinput, x_scale = fp8_per_token_group_quant(x_2d, x_2d.shape[-1])
+        elif self.quant_type == "fp8-per-token-sgl" and self.native_fp8_support:
+            origin_shape = x.shape
+            x_2d = x.view(-1, x.shape[-1])
+            qinput, x_scale = fp8_per_token_quant_sgl(x_2d)
         elif self.quant_type == "fp8-per-block" and self.native_fp8_support:
             origin_shape = x.shape
             x = x.view(-1, x.shape[-1])
@@ -85,7 +90,11 @@ def forward(self, x):
             origin_shape=origin_shape,
         )
 
-        if self.quant_type == "fp8-per-token" and x.dim() == 3 and output.dim() == 2:
+        if (
+            self.quant_type in ["fp8-per-token", "fp8-per-token-sgl"]
+            and x.dim() == 3
+            and output.dim() == 2
+        ):
             output = output.unsqueeze(0)
 
         return output
diff --git a/angelslim/compressor/diffusion/quant/ptq.py b/angelslim/compressor/diffusion/quant/ptq.py
@@ -23,12 +23,14 @@
 from .modules import FP8DynamicLinear, FP8WeightOnlyLinear
 from .quant_func import (
     fp8_per_block_quant,
+    fp8_per_channel_quant,
     fp8_per_tensor_quant,
     fp8_per_token_group_quant,
 )
 from .utils import (
     QuantType,
     _ensure_deep_gemm,
+    _ensure_sgl_kernel,
     cleanup_memory,
     load_fp8_scales,
     load_quantized_model,
@@ -106,6 +108,10 @@ def _quantize_linear_weight(
                 linear.weight, linear.weight.shape[-1]
             )
             weight_scale = weight_scale.t()
+        elif self.quant_type == QuantType.FP8_PER_TOKEN_SGL:
+            if self.native_fp8_support:
+                _ensure_sgl_kernel()
+            quant_weight, weight_scale = fp8_per_channel_quant(linear.weight)
         elif self.quant_type == QuantType.FP8_PER_BLOCK:
             if self.native_fp8_support:
                 _ensure_deep_gemm()
diff --git a/angelslim/compressor/diffusion/quant/quant_func.py b/angelslim/compressor/diffusion/quant/quant_func.py
@@ -21,7 +21,7 @@
     fp8_per_block_quant_triton,
     fp8_per_token_group_quant_triton,
 )
-from .utils import QuantType, _ensure_deep_gemm
+from .utils import QuantType, _ensure_deep_gemm, _ensure_sgl_kernel
 
 FP8_MAX = float(torch.finfo(torch.float8_e4m3fn).max)
 FP8_MIN = float(torch.finfo(torch.float8_e4m3fn).min)
@@ -87,6 +87,41 @@ def fp8_per_token_group_quant(
     )
 
 
+def fp8_per_channel_quant(weight: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Per-channel FP8 weight quantization (E4M3 format)
+
+    Args:
+        weight: Original weight tensor with shape [out_features, in_features]
+
+    Returns:
+        weight_quant: Quantized weight [out_features, in_features], dtype=float8_e4m3fn
+        weight_scale: Scale factors [out_features, 1], dtype=float32
+    """
+    abs_max = torch.abs(weight).amax(dim=1, keepdim=True)  # [out_features, 1]
+
+    weight_scale = abs_max / FP8_MAX
+    weight_scale = torch.clamp(weight_scale, min=1e-12)
+
+    weight_scaled = (weight / weight_scale).clamp(min=FP8_MIN, max=FP8_MAX)
+    weight_quant = weight_scaled.to(torch.float8_e4m3fn)
+
+    return weight_quant, weight_scale.float()
+
+
+def fp8_per_token_quant_sgl(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    m, k = x.shape
+    input_tensor_quant = torch.empty(
+        (m, k), dtype=torch.float8_e4m3fn, device="cuda", requires_grad=False
+    )
+    input_tensor_scale = torch.empty(
+        (m, 1), dtype=torch.float32, device="cuda", requires_grad=False
+    )
+    _sgl_kernel = _ensure_sgl_kernel()
+    _sgl_kernel.sgl_per_token_quant_fp8(x, input_tensor_quant, input_tensor_scale)
+    return input_tensor_quant, input_tensor_scale
+
+
 # pure torch implementation of block-wise FP8 quantization on cpu
 def fp8_per_block_quant_torch(
     x: torch.Tensor, block_size: int = 128
@@ -260,6 +295,35 @@ def fp8_weight_only_gemm(A, B, B_scale, bias, out_dtype):
     return output
 
 
+def fp8_gemm_sgl_token(A, A_scale, B, B_scale, out_dtype, bias):
+    """GEMM function for FP8 per-token-sgl quantization using sgl-kernel.
+
+    Args:
+        A: Input activation tensor
+        A_scale: Scale tensor for input activations
+        B: Weight tensor
+        B_scale: Scale tensor for weights
+        out_dtype: Output data type.
+        bias: Optional bias tensor
+
+    Returns:
+        torch.Tensor: Result of the GEMM operation.
+    """
+    _sgl_kernel = _ensure_sgl_kernel()
+    shape = (A.shape[0], B.shape[0])
+    output = torch.empty(shape, dtype=out_dtype, device=A.device, requires_grad=False)
+    output = _sgl_kernel.fp8_scaled_mm(
+        A,
+        B.t(),
+        A_scale,
+        B_scale.float(),
+        out_dtype,
+        bias=bias,
+    )
+
+    return output
+
+
 def fp8_gemm(
     A: torch.Tensor,
     A_scale: torch.Tensor,
@@ -300,6 +364,9 @@ def fp8_gemm(
         if quant_type in (QuantType.FP8_PER_TENSOR, QuantType.FP8_PER_TOKEN):
             # Use torch native fp8 GEMM for per-tensor and per-token fp8 quantization
             return fp8_gemm_torch_tensor_token(A, A_scale, B, B_scale, out_dtype, bias)
+        elif quant_type == QuantType.FP8_PER_TOKEN_SGL:
+            # Use sgl-kernel for per-token-sgl fp8 quantization
+            return fp8_gemm_sgl_token(A, A_scale, B, B_scale, out_dtype, bias)
         elif quant_type == QuantType.FP8_PER_BLOCK:
             # Use deepgemm accelerated blockwise fp8 GEMM
             return fp8_gemm_deepgemm_block(
@@ -324,7 +391,8 @@ def fp8_gemm(
         f"\n  native_fp8_support={native_fp8_support}.\n"
         "Supported combinations:\n"
         "  - native_fp8_support=True, "
-        "quant_type in [fp8-per-tensor, fp8-per-token, fp8-per-block]\n"
+        "quant_type in [fp8-per-tensor, fp8-per-token,"
+        " fp8-per-block, fp8-per-token-sgl]\n"
         "  - native_fp8_support=False, "
         "quant_type in [fp8-per-tensor, fp8-per-block]"
     )
diff --git a/angelslim/compressor/diffusion/quant/utils/__init__.py b/angelslim/compressor/diffusion/quant/utils/__init__.py
@@ -17,6 +17,7 @@
     QuantType,
     _compile_pattern,
     _ensure_deep_gemm,
+    _ensure_sgl_kernel,
     cleanup_memory,
     replace_module,
     should_quantize_layer,
@@ -32,4 +33,5 @@
     "cleanup_memory",
     "replace_module",
     "should_quantize_layer",
+    "_ensure_sgl_kernel",
 ]
diff --git a/angelslim/compressor/diffusion/quant/utils/utils.py b/angelslim/compressor/diffusion/quant/utils/utils.py
@@ -25,6 +25,7 @@
     "should_quantize_layer",
     "_compile_pattern",
     "_ensure_deep_gemm",
+    "_ensure_sgl_kernel",
     "QuantType",
 ]
 
@@ -34,11 +35,13 @@ class QuantType:
     FP8_PER_TOKEN = "fp8-per-token"
     FP8_PER_BLOCK = "fp8-per-block"
     FP8_PER_TENSOR_WEIGHT_ONLY = "fp8-per-tensor-weight-only"
+    FP8_PER_TOKEN_SGL = "fp8-per-token-sgl"
     VALID_TYPES = [
         FP8_PER_TENSOR,
         FP8_PER_TOKEN,
         FP8_PER_BLOCK,
         FP8_PER_TENSOR_WEIGHT_ONLY,
+        FP8_PER_TOKEN_SGL,
     ]
 
     @classmethod
@@ -171,3 +174,28 @@ def _ensure_deep_gemm():
                 "native_fp8_support, but was not found. Please install deep_gemm first."
             )
         ) from e
+
+
+_sgl_kernel_cached = None  # module-level cache; assigned by _ensure_sgl_kernel()
+
+
+def _ensure_sgl_kernel():
+    """
+    Lazy, safe import of sgl_kernel with process-level caching. Returns the module
+    if available, otherwise raises a clear error.
+    """
+    global _sgl_kernel_cached
+    if _sgl_kernel_cached is not None:
+        return _sgl_kernel_cached
+    try:
+        import sgl_kernel
+
+        _sgl_kernel_cached = sgl_kernel
+        return _sgl_kernel_cached
+    except ImportError as e:
+        raise ImportError(
+            (
+                "sgl_kernel is required for 'fp8-per-token-sgl' quantization with "
+                "native_fp8_support, but was not found. Please install sgl_kernel first"
+            )
+        ) from e
diff --git a/docs/source/features/diffusion/quantization.md b/docs/source/features/diffusion/quantization.md
@@ -10,14 +10,35 @@ AngelSlim 支持以下四种 FP8 量化策略：
 - **fp8-per-tensor-weight-only**：仅对权重量化（权重：FP8，激活仍为 BF16/FP16），适合对精度有更高要求的场景
 - **fp8-per-block**：支持 per-block 量化，适用于 NVIDIA Hopper (SM90+) 架构，block_size目前只支持128
 - **fp8-per-token**：精细的 per-token 量化，对多样输入有更强适应性
+- **fp8-per-token-sgl**：基于 SGL kernel 的 per-token 量化，使用优化的 CUDA kernel 实现更高效的 per-token 量化和矩阵乘法运算
+
+## 可选依赖安装
+### deep_gemm（用于 fp8-per-block）
+
+`fp8-per-block` 量化在启用 `native_fp8_support` 时需要安装 `deep_gemm`：
+
+```shell
+git clone --recursive https://github.com/deepseek-ai/DeepGEMM.git
+cd DeepGEMM
+./develop.sh
+./install.sh
+```
+
+### sgl_kernel（用于 fp8-per-token-sgl）
+
+`fp8-per-token-sgl` 量化在启用 `native_fp8_support` 时需要安装 `sgl_kernel`：
+
+```shell
+pip install sgl-kernel==0.3.18
+```
 
 ## 配置
 
 DynamicDiTQuantizer 类提供灵活的配置选项，您可以通过以下参数自定义量化行为：
 
 ### 构造函数参数
 
-- `quant_type`（str）：量化类型，可选值 "fp8-per-tensor"、"fp8-per-tensor-weight-only"、"fp8-per-block"、"fp8-per-token"
+- `quant_type`（str）：量化类型，可选值 "fp8-per-tensor"、"fp8-per-tensor-weight-only"、"fp8-per-block"、"fp8-per-token"、"fp8-per-token-sgl"
 - `include_patterns`（List[str|re.Pattern], 可选）：指定需要量化的层名称模式，支持字符串或正则表达式
 - `exclude_patterns`（List[str|re.Pattern], 可选）：指定需要排除的层名称模式，支持字符串或正则表达式
 - `layer_filter`（Callable, 可选）：自定义层筛选函数（高级自定义场景专用）
@@ -166,6 +187,3 @@ quantizer.export_quantized_weight(pipe.transformer, save_path="/path/to/save/qua
 - `fp8_scales.safetensors`：FP8 缩放因子文件
 
 导出后可通过上述"加载预量化模型和缩放因子"的方式加载使用。
-
-
-