Tencent
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/README.md b/README.md
Lines changed: 1 addition & 53 deletions
diff --git a/README_cn.md b/README_cn.md
Lines changed: 1 addition & 1 deletion
diff --git a/angelslim/compressor/_platform.py b/angelslim/compressor/_platform.py
Lines changed: 2 additions & 3 deletions
diff --git a/angelslim/compressor/compressor_factory.py b/angelslim/compressor/compressor_factory.py
Lines changed: 1 addition & 3 deletions
diff --git a/angelslim/compressor/diffusion/cache/cache_helper.py b/angelslim/compressor/diffusion/cache/cache_helper.py
Lines changed: 1 addition & 3 deletions
diff --git a/angelslim/compressor/diffusion/cache/deepcache_helper.py b/angelslim/compressor/diffusion/cache/deepcache_helper.py
Lines changed: 1 addition & 3 deletions
diff --git a/angelslim/compressor/diffusion/cache/taylorcache_helper.py b/angelslim/compressor/diffusion/cache/taylorcache_helper.py
Lines changed: 8 additions & 18 deletions
diff --git a/angelslim/compressor/diffusion/cache/teacache_helper.py b/angelslim/compressor/diffusion/cache/teacache_helper.py
Lines changed: 1 addition & 3 deletions
diff --git a/angelslim/compressor/diffusion/kernels/python/gemm/fp8_gemm_torch.py b/angelslim/compressor/diffusion/kernels/python/gemm/fp8_gemm_torch.py
Lines changed: 1 addition & 5 deletions
@@ -4,7 +4,7 @@ repos:
     hooks:
       - id: black
         name: Black
-        args: [--line-length=88]
+        args: [--line-length=99]
 
   - repo: https://github.com/pycqa/isort
     rev: 5.13.2
@@ -20,6 +20,6 @@ repos:
         name: Flake8
         args: [
             "--ignore=E203,W503,W504",
-            "--max-line-length=88"
+            "--max-line-length=99"
         ]
         additional_dependencies: [flake8-bugbear]
@@ -221,61 +221,9 @@ Alternatively, you can clone the repository and install from source in editable
 cd AngelSlim && python setup.py install
 ```
 
-For more detailed installation instructions, please refer to the [Installation Documentation](https://angelslim.readthedocs.io/zh-cn/latest/getting_started/installation.html).
+For more detailed installation instructions and platform-specific guidance, please refer to the [Installation Documentation](https://angelslim.readthedocs.io/zh-cn/latest/getting_started/installation.html).
 
-#### Windows Installation (with FP8 Triton Support)
 
-AngelSlim supports Windows with FP8 Triton kernels. Follow these steps to build from source:
-
-```batch
-:: Clone the repository
-git clone https://github.com/Tencent/AngelSlim.git
-cd AngelSlim
-
-:: Create and activate virtual environment (Python 3.10 recommended)
-uv venv --python 3.10
-.venv\Scripts\activate
-
-:: Install base dependencies
-uv pip install packaging wheel setuptools ninja numpy==1.26.4 pip build psutil
-
-:: Install PyTorch with CUDA 12.8 support
-uv pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu128
-
-:: Install Triton for Windows
-uv pip install -U triton-windows
-
-:: Configure Visual Studio build environment
-set INCLUDE=
-set LIB=
-set LIBPATH=
-call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
-
-:: Configure CUDA environment
-set CUDA_HOME=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8
-set PATH=%CUDA_HOME%\bin;%PATH%
-set DISTUTILS_USE_SDK=1
-
-:: Set target CUDA architectures (adjust based on your GPU)
-set TORCH_CUDA_ARCH_LIST=8.0;8.6;8.9;9.0
-
-:: Build the wheel
-set DG_USE_LOCAL_VERSION=0
-python setup.py bdist_wheel
-
-:: Verify FP8 Triton kernels are working
-python -c "import torch; from angelslim.compressor.diffusion.kernels.python.quantizers import fp8_per_block_quant_triton; from angelslim.compressor.diffusion.kernels.python.gemm import fp8_gemm_triton_block; a,b=torch.randn(128,256,device='cuda'),torch.randn(512,256,device='cuda'); aq,a_s=fp8_per_block_quant_triton(a); bq,b_s=fp8_per_block_quant_triton(b); c=fp8_gemm_triton_block(aq,a_s,bq,b_s); print(f'FP8 GEMM OK: {c.shape}, {c.dtype}')"
-```
-
-**Requirements:**
-- Windows 10/11 with NVIDIA GPU (Ampere or newer recommended)
-- Visual Studio 2022 with C++ build tools
-- CUDA Toolkit 12.8
-- Python 3.10
-
-**Environment Variables:**
-- `ANGELSLIM_BACKEND`: Force backend selection (`triton` or `pytorch`)
-- `ANGELSLIM_TORCH_COMPILE`: Enable/disable torch.compile (`0` or `1`)
 
 ### 2. Quick Start
 
 
@@ -223,7 +223,7 @@ pip install angelslim
 cd AngelSlim && python setup.py install
 ```
 
-更详细的安装说明可参考[安装文档](https://angelslim.readthedocs.io/zh-cn/latest/getting_started/installation.html)。
+更详细的安装说明以及不同平台的安装指引，可参考[安装文档](https://angelslim.readthedocs.io/zh-cn/latest/getting_started/installation.html)。
 
 ### 2、快速开始
 
 
@@ -28,7 +28,6 @@
 import sys
 from enum import Enum
 from functools import lru_cache
-from typing import Optional
 
 import torch
 
@@ -90,7 +89,7 @@ def is_triton_available() -> bool:
 
     # Try to import triton
     try:
-        import triton
+        import triton  # noqa: F811 F401
 
         # Test if JIT compilation works
         return _test_triton_jit()
@@ -200,7 +199,7 @@ def get_backend_info() -> dict:
         "triton_available": is_triton_available(),
         "torch_compile_supported": is_torch_compile_supported(),
         "cuda_available": torch.cuda.is_available(),
-        "cuda_device": torch.cuda.get_device_name() if torch.cuda.is_available() else None,
+        "cuda_device": (torch.cuda.get_device_name() if torch.cuda.is_available() else None),
         "torch_version": torch.__version__,
         "env_backend": os.environ.get("ANGELSLIM_BACKEND", "auto"),
         "env_torch_compile": os.environ.get("ANGELSLIM_TORCH_COMPILE", "auto"),
 
@@ -37,9 +37,7 @@ def register_class(compress_cls: Type[Any]) -> Type[Any]:
             """Register a class using its own name as the key"""
             key = compress_cls.__name__
             if key in cls._compress_methods:
-                print_info(
-                    f"Compression method '{key}' already exists, will be overwritten."
-                )
+                print_info(f"Compression method '{key}' already exists, will be overwritten.")
             cls._compress_methods[key] = compress_cls
             return compress_cls
 
 
@@ -41,9 +41,7 @@ def enable(self) -> None:
             ValueError: Raised when both double_blocks and single_blocks are empty
         """
         if not self.double_blocks and not self.single_blocks:
-            raise ValueError(
-                "At least one of double_blocks or single_blocks must be provided"
-            )
+            raise ValueError("At least one of double_blocks or single_blocks must be provided")
 
         self.reset_states()
         self.wrap_modules()
 
@@ -34,9 +34,7 @@ def __init__(
             single_blocks=single_blocks,
             no_cache_steps=no_cache_steps,
         )
-        self.no_cache_block_id = (
-            no_cache_block_id if no_cache_block_id is not None else {}
-        )
+        self.no_cache_block_id = no_cache_block_id if no_cache_block_id is not None else {}
 
     def is_skip(self, block_id: int, blocktype: str) -> bool:
         # For some pipelines, the first timestep may not be 0
 
@@ -218,12 +218,8 @@ def __init__(self, max_order: int):
         for i in range(max_order + 1):
             self.register_buffer(f"derivative_{i}_low_freqs", None, persistent=False)
             self.register_buffer(f"derivative_{i}_high_freqs", None, persistent=False)
-            self.register_buffer(
-                f"temp_derivative_{i}_low_freqs", None, persistent=False
-            )
-            self.register_buffer(
-                f"temp_derivative_{i}_high_freqs", None, persistent=False
-            )
+            self.register_buffer(f"temp_derivative_{i}_low_freqs", None, persistent=False)
+            self.register_buffer(f"temp_derivative_{i}_high_freqs", None, persistent=False)
 
     def get_derivative(self, order: int, freqs: str) -> Optional[torch.Tensor]:
         return getattr(self, f"derivative_{order}_{freqs}")
@@ -265,14 +261,10 @@ def taylor_formula(self, distance: int) -> torch.Tensor:
         high_freqs_output = 0
         for i in range(len(self.get_all_filled_derivatives("low_freqs"))):
             coefficient = 1 / math.factorial(i)
-            low_freqs_output += (
-                coefficient * self.get_derivative(i, "low_freqs") * (distance**i)
-            )
+            low_freqs_output += coefficient * self.get_derivative(i, "low_freqs") * (distance**i)
         for i in range(len(self.get_all_filled_derivatives("high_freqs"))):
             coefficient = 1 / math.factorial(i)
-            high_freqs_output += (
-                coefficient * self.get_derivative(i, "high_freqs") * (distance**i)
-            )
+            high_freqs_output += coefficient * self.get_derivative(i, "high_freqs") * (distance**i)
 
         return reconstruction(low_freqs_output, high_freqs_output)
 
@@ -288,18 +280,16 @@ def derivatives_computation(
         self.set_temp_derivative(0, "high_freqs", x_high)
         for i in range(low_freqs_order):
             if self.get_derivative(i, "low_freqs") is not None:
-                derivative_diff = self.get_temp_derivative(
+                derivative_diff = self.get_temp_derivative(i, "low_freqs") - self.get_derivative(
                     i, "low_freqs"
-                ) - self.get_derivative(i, "low_freqs")
+                )
                 self.set_temp_derivative(i + 1, "low_freqs", derivative_diff / distance)
         for i in range(high_freqs_order):
             if self.get_derivative(i, "high_freqs") is not None:
-                derivative_diff = self.get_temp_derivative(
+                derivative_diff = self.get_temp_derivative(i, "high_freqs") - self.get_derivative(
                     i, "high_freqs"
-                ) - self.get_derivative(i, "high_freqs")
-                self.set_temp_derivative(
-                    i + 1, "high_freqs", derivative_diff / distance
                 )
+                self.set_temp_derivative(i + 1, "high_freqs", derivative_diff / distance)
         self.move_temp_to_derivative()
 
     def clear_temp_derivative(self) -> None:
 
@@ -81,9 +81,7 @@ def wrapped_forward(*args, **kwargs):
                 else:
                     is_last_single_block = block_id == len(self.single_blocks) - 1
                     if blocktype == "single_blocks" and is_last_single_block:
-                        img_seq_len = self.cached_output[("double_blocks", 0)][0].shape[
-                            1
-                        ]
+                        img_seq_len = self.cached_output[("double_blocks", 0)][0].shape[1]
                         cached_output = result[:, :img_seq_len, ...]
                         self.previous_residual = cached_output - self.cached_input
 
 
@@ -67,7 +67,6 @@ def fp8_gemm_torch_block(
     # Dequantize A: expand scales to match tensor dimensions
     # a_s shape is typically [M, K//block_size]
     a_s_2d = a_s.view(M, -1)  # [M, num_k_blocks]
-    num_k_blocks = a_s_2d.shape[1]
 
     # Dequantize by expanding scales
     a_dq = _dequantize_per_group(a_2d, a_s_2d, block_size, K)
@@ -121,9 +120,7 @@ def _dequantize_per_group(
     elif s_expanded.shape[1] < K:
         # Pad with last scale value
         pad_size = K - s_expanded.shape[1]
-        s_expanded = torch.nn.functional.pad(
-            s_expanded, (0, pad_size), mode="replicate"
-        )
+        s_expanded = torch.nn.functional.pad(s_expanded, (0, pad_size), mode="replicate")
 
     return x_float * s_expanded
 
@@ -146,7 +143,6 @@ def _dequantize_blockwise_2d(
     """
     N, K = x.shape
     n_blocks, k_blocks = s.shape
-    device = x.device
 
     x_float = x.to(torch.float32)
     y = torch.empty_like(x_float)