Tencent
diff --git a/‎angelslim/compressor/diffusion/README.md‎
Lines changed: 66 additions & 0 deletions b/‎angelslim/compressor/diffusion/README.md‎
Lines changed: 66 additions & 0 deletions
diff --git a/‎angelslim/compressor/diffusion/__init__.py‎
Lines changed: 15 additions & 0 deletions b/‎angelslim/compressor/diffusion/__init__.py‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎angelslim/compressor/diffusion/kernels/__init__.py‎
Lines changed: 13 additions & 0 deletions b/‎angelslim/compressor/diffusion/kernels/__init__.py‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎angelslim/compressor/diffusion/kernels/python/gemm/__init__.py‎
Lines changed: 17 additions & 0 deletions b/‎angelslim/compressor/diffusion/kernels/python/gemm/__init__.py‎
Lines changed: 17 additions & 0 deletions
diff --git a/‎angelslim/compressor/diffusion/kernels/python/gemm/fp8_gemm.py‎
Lines changed: 112 additions & 0 deletions b/‎angelslim/compressor/diffusion/kernels/python/gemm/fp8_gemm.py‎
Lines changed: 112 additions & 0 deletions
diff --git a/‎angelslim/compressor/diffusion/kernels/python/quantizers/__init__.py‎
Lines changed: 18 additions & 0 deletions b/‎angelslim/compressor/diffusion/kernels/python/quantizers/__init__.py‎
Lines changed: 18 additions & 0 deletions
diff --git a/‎angelslim/compressor/diffusion/kernels/python/quantizers/fp8_per_block.py‎
Lines changed: 78 additions & 0 deletions b/‎angelslim/compressor/diffusion/kernels/python/quantizers/fp8_per_block.py‎
Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,66 @@
+# AngelSlim Diffusion Model Compression
+
+AngelSlim offers flexible and efficient tools for compressing Diffusion Transformer (DiT) models. The quantization utilities are modular and easy to integrate into custom inference pipelines.
+
+## Quick Start: FP8 Quantization for Diffusion Models
+
+```python
+import torch
+from diffusers import FluxPipeline
+from angelslim.compressor.diffusion import DynamicDiTQuantizer
+
+# Load DiT pipeline with bfloat16 to reduce memory usage
+pipe = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16)
+
+# Supported quantization types: "fp8-per-tensor", "fp8-per-block", "fp8-per-token"
+# If you want to use "fp8-per-block" + DeepGEMM on NVIDIA Hopper (SM90+) devices,
+# please refer to https://github.com/deepseek-ai/DeepGEMM for installation instructions.
+quantizer = DynamicDiTQuantizer(quant_type="fp8-per-tensor")
+quantizer.quantize(pipe.transformer)
+
+pipe.to("cuda")
+
+# Run pipeline with FP8-quantized transformer
+image = pipe(
+    "A cat holding a sign that says hello world",
+    guidance_scale=0.0,
+    num_inference_steps=4,
+    max_sequence_length=256,
+    generator=torch.Generator("cuda").manual_seed(0)
+).images[0]
+image.save("flux-schnell_fp8_per_tensor.png")
+```
+
+## Customizable Quantization Layer Selection
+
+AngelSlim provides fine-grained control over which layers are quantized. You can specify inclusion and exclusion patterns as substrings or regular expressions.
+
+```python
+from angelslim.compressor.diffusion import DynamicDiTQuantizer
+
+# Option 1: Default filtering (quantizes common linear layers)
+quantizer = DynamicDiTQuantizer(quant_type="fp8-per-tensor")
+
+# Option 2: String-based include/exclude patterns
+quantizer = DynamicDiTQuantizer(
+    quant_type="fp8-per-tensor",
+    include_patterns=["linear", "attention"],
+    exclude_patterns=["embed", "norm"]
+)
+
+# Option 3: Regex pattern matching (auto-detected)
+quantizer = DynamicDiTQuantizer(
+    quant_type="fp8-per-tensor",
+    include_patterns=[r".*\.linear\d+", r".*\.attn.*"],
+    exclude_patterns=[r".*embed.*"]
+)
+
+# Option 4: Mix of strings and regex for flexible rules
+quantizer = DynamicDiTQuantizer(
+    quant_type="fp8-per-tensor",
+    include_patterns=["linear", r".*\.attn.*"],
+    exclude_patterns=["embed", r".*norm.*"]
+)
+```
+
+For more details on customizing quantization behavior, see the API documentation.
@@ -0,0 +1,15 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .quant import *  # noqa: F401 F403
@@ -0,0 +1,13 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,17 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .fp8_gemm import fp8_gemm_triton_block
+
+__all__ = ["fp8_gemm_triton_block"]
@@ -0,0 +1,112 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import torch
+import triton
+import triton.language as tl
+
+# modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
+fp8_gemm_configs = [
+    triton.Config(
+        {"BLOCK_SIZE_M": block_m, "BLOCK_SIZE_N": block_n, "BLOCK_SIZE_K": 128},
+        num_stages=num_stages,
+        num_warps=8,
+    )
+    for block_m in [16, 32, 64]
+    for block_n in [32, 64, 128]
+    for num_stages in [3, 4, 5, 6]
+]
+
+
+# Autotuned over `fp8_gemm_configs`; the autotune key is the (N, K) pair.
+@triton.autotune(configs=fp8_gemm_configs, key=["N", "K"])
+@triton.jit
+def _fp8_gemm_triton_block_kernel(
+    a_ptr,
+    b_ptr,
+    c_ptr,
+    a_s_ptr,
+    b_s_ptr,
+    M,
+    N: tl.constexpr,
+    K: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+):
+    """
+    Performs a matrix multiplication operation on FP8 matrices with scaling factors.
+
+    Each program computes one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of the output,
+    walking the K dimension in steps of BLOCK_SIZE_K and applying the scales
+    to every partial product before accumulating in float32.
+
+    Memory indexing used by the kernel (all flat offsets from the pointer):
+        a_ptr:   element (m, kk) at ``m * K + kk``
+        b_ptr:   element (n, kk) at ``n * K + kk`` (so the result is a @ b^T)
+        c_ptr:   element (m, n)  at ``m * N + n``
+        a_s_ptr: scale for row m, K-step i at ``m * cdiv(K, BLOCK_SIZE_K) + i``
+        b_s_ptr: scale for column n, K-step i at
+                 ``(n // BLOCK_SIZE_K) * cdiv(K, BLOCK_SIZE_K) + i``
+
+    M, N, K are the problem sizes; the BLOCK_SIZE_* values come from the
+    autotune configuration.
+    """
+    pid_m = tl.program_id(axis=0)
+    pid_n = tl.program_id(axis=1)
+    # Number of BLOCK_SIZE_K steps along K; also the row stride of both scale arrays.
+    k = tl.cdiv(K, BLOCK_SIZE_K)
+    # Row/column indices are wrapped with a modulo so that loads of edge tiles
+    # stay in bounds; the surplus lanes are discarded by the masked store below.
+    offs_m = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M
+    offs_n = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N
+    offs_k = tl.arange(0, BLOCK_SIZE_K)
+    # a tile is (BLOCK_SIZE_M, BLOCK_SIZE_K); b tile is (BLOCK_SIZE_K, BLOCK_SIZE_N),
+    # i.e. b is read transposed relative to its (n, kk) storage order.
+    a_ptrs = a_ptr + offs_m[:, None] * K + offs_k[None, :]
+    b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
+    # One a-scale per row and K-step; one b-scale per group of BLOCK_SIZE_K
+    # output columns and K-step (the column group size reuses BLOCK_SIZE_K).
+    a_s_ptrs = a_s_ptr + offs_m * k
+    b_s_ptrs = b_s_ptr + (offs_n // BLOCK_SIZE_K) * k
+
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for i in range(k):
+        # Lanes beyond the end of K load 0.0 and therefore add nothing to the dot product.
+        a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0.0)
+        b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0.0)
+        a_s = tl.load(a_s_ptrs)
+        b_s = tl.load(b_s_ptrs)
+
+        # Scale this K-step's partial product by the row and column scales.
+        accumulator += tl.dot(a, b) * a_s[:, None] * b_s[None, :]
+        # Advance data pointers by one K-step and scale pointers by one entry.
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += BLOCK_SIZE_K
+        a_s_ptrs += 1
+        b_s_ptrs += 1
+    # Cast the float32 accumulator to the output buffer's element type.
+    c = accumulator.to(c_ptr.dtype.element_ty)
+    # Recompute the offsets WITHOUT the modulo so the bounds mask is meaningful.
+    offs_m = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_n = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    c_ptrs = c_ptr + offs_m[:, None] * N + offs_n[None, :]
+    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
+    tl.store(c_ptrs, c, mask=mask)
+
+
+# triton fp8 gemm for fp8 per-block weight & fp8 per-group activation
+# modified from https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
+def fp8_gemm_triton_block(
+    a: torch.Tensor,
+    a_s: torch.Tensor,
+    b: torch.Tensor,
+    b_s: torch.Tensor,
+    out_dtype=torch.bfloat16,
+    bias=None,
+) -> torch.Tensor:
+    """
+    Perform a matrix multiplication using FP8 precision.
+    """
+    assert a.is_contiguous() and b.is_contiguous()
+    assert a_s.is_contiguous() and b_s.is_contiguous()
+    K = a.size(-1)
+    M = a.numel() // K
+    N = b.size(0)
+    c = a.new_empty(*a.size()[:-1], N, dtype=out_dtype)
+
+    def grid(meta):
+        return (
+            triton.cdiv(M, meta["BLOCK_SIZE_M"]),
+            triton.cdiv(N, meta["BLOCK_SIZE_N"]),
+        )
+
+    _fp8_gemm_triton_block_kernel[grid](a, b, c, a_s, b_s, M, N, K)
+
+    if bias is not None:
+        c += bias
+
+    return c
@@ -0,0 +1,18 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .fp8_per_block import fp8_per_block_quant_triton
+from .fp8_per_token_group import fp8_per_token_group_quant_triton
+
+__all__ = ["fp8_per_token_group_quant_triton", "fp8_per_block_quant_triton"]
@@ -0,0 +1,78 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+
+# https://github.com/deepseek-ai/DeepSeek-V3/blob/main/inference/kernel.py
+@triton.jit
+def _fp8_per_block_quant_kernel(x_ptr, y_ptr, s_ptr, M, N, BLOCK_SIZE: tl.constexpr):
+    """Quantizes FP32 tensor to FP8 format using block-wise quantization."""
+    pid_m = tl.program_id(axis=0)
+    pid_n = tl.program_id(axis=1)
+    n = tl.cdiv(N, BLOCK_SIZE)
+
+    offs_m = pid_m * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    offs_n = pid_n * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    offs = offs_m[:, None] * N + offs_n[None, :]
+
+    mask = (offs_m[:, None] < M) & (offs_n[None, :] < N)
+    x = tl.load(x_ptr + offs, mask=mask).to(tl.float32)
+    max_val = tl.max(tl.abs(x))
+    scale = max_val / 448.0
+    scale = tl.where(max_val == 0.0, 1.0, scale)
+    y = x / scale
+    y = y.to(y_ptr.dtype.element_ty)
+
+    tl.store(y_ptr + offs, y, mask=mask)
+    tl.store(s_ptr + pid_m * n + pid_n, scale)
+
+
+# triton implementation
+# for weight quantization on gpu
+def fp8_per_block_quant_triton(
+    x: torch.Tensor, block_size: int = 128
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantizes a FP32 2D tensor to FP8 (E4M3FN) using block-wise quantization.
+    For each (block_size x block_size) block:
+        - scale = max(abs(block)) / 448.0 (FP8 E4M3FN max magnitude)
+        - if block is all zeros, use scale = 1.0 to avoid div-by-zero
+        - scale, clamp and cast to FP8
+    Returns:
+        y: Quantized FP8 tensor, same shape as input
+        s: Per-block scales, shape (num_blocks_M, num_blocks_N)
+    """
+    assert x.is_contiguous()
+    assert x.dim() == 2
+
+    M, N = x.size()
+    y = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    m_blocks = triton.cdiv(M, block_size)
+    n_blocks = triton.cdiv(N, block_size)
+    s = torch.empty((m_blocks, n_blocks), dtype=torch.float32, device=x.device)
+
+    def grid(meta):
+        return (
+            triton.cdiv(M, meta["BLOCK_SIZE"]),
+            triton.cdiv(N, meta["BLOCK_SIZE"]),
+        )
+
+    _fp8_per_block_quant_kernel[grid](x, y, s, M, N, BLOCK_SIZE=block_size)
+
+    return y, s