Tencent
diff --git a/‎README.md‎
Lines changed: 54 additions & 0 deletions b/‎README.md‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎angelslim/compressor/_platform.py‎
Lines changed: 207 additions & 0 deletions b/‎angelslim/compressor/_platform.py‎
Lines changed: 207 additions & 0 deletions
diff --git a/‎angelslim/compressor/diffusion/cache/taylorcache_helper.py‎
Lines changed: 19 additions & 3 deletions b/‎angelslim/compressor/diffusion/cache/taylorcache_helper.py‎
Lines changed: 19 additions & 3 deletions
diff --git a/‎angelslim/compressor/diffusion/kernels/python/gemm/__init__.py‎
Lines changed: 19 additions & 2 deletions b/‎angelslim/compressor/diffusion/kernels/python/gemm/__init__.py‎
Lines changed: 19 additions & 2 deletions
@@ -223,6 +223,60 @@ cd AngelSlim && python setup.py install
 
 For more detailed installation instructions, please refer to the [Installation Documentation](https://angelslim.readthedocs.io/zh-cn/latest/getting_started/installation.html).
 
+#### Windows Installation (with FP8 Triton Support)
+
+AngelSlim supports Windows with FP8 Triton kernels. Follow these steps to build from source:
+
+```batch
+:: Clone the repository
+git clone https://github.com/Tencent/AngelSlim.git
+cd AngelSlim
+
+:: Create and activate virtual environment (Python 3.10 recommended)
+uv venv --python 3.10
+.venv\Scripts\activate
+
+:: Install base dependencies
+uv pip install packaging wheel setuptools ninja numpy==1.26.4 pip build psutil
+
+:: Install PyTorch with CUDA 12.8 support
+uv pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu128
+
+:: Install Triton for Windows
+uv pip install -U triton-windows
+
+:: Configure Visual Studio build environment
+set INCLUDE=
+set LIB=
+set LIBPATH=
+call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
+
+:: Configure CUDA environment
+set CUDA_HOME=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8
+set PATH=%CUDA_HOME%\bin;%PATH%
+set DISTUTILS_USE_SDK=1
+
+:: Set target CUDA architectures (adjust based on your GPU)
+set TORCH_CUDA_ARCH_LIST=8.0;8.6;8.9;9.0
+
+:: Build the wheel
+set DG_USE_LOCAL_VERSION=0
+python setup.py bdist_wheel
+
+:: Verify FP8 Triton kernels are working
+python -c "import torch; from angelslim.compressor.diffusion.kernels.python.quantizers import fp8_per_block_quant_triton; from angelslim.compressor.diffusion.kernels.python.gemm import fp8_gemm_triton_block; a,b=torch.randn(128,256,device='cuda'),torch.randn(512,256,device='cuda'); aq,a_s=fp8_per_block_quant_triton(a); bq,b_s=fp8_per_block_quant_triton(b); c=fp8_gemm_triton_block(aq,a_s,bq,b_s); print(f'FP8 GEMM OK: {c.shape}, {c.dtype}')"
+```
+
+**Requirements:**
+- Windows 10/11 with NVIDIA GPU (Ampere or newer recommended)
+- Visual Studio 2022 with C++ build tools
+- CUDA Toolkit 12.8
+- Python 3.10
+
+**Environment Variables:**
+- `ANGELSLIM_BACKEND`: Force backend selection (`triton` or `pytorch`); leave unset to auto-detect
+- `ANGELSLIM_TORCH_COMPILE`: Force torch.compile on or off (`1`/`true` or `0`/`false`); when unset, it is disabled on Windows
+
 ### 2. Quick Start
 
 #### 2.1 Speculative Decoding
 
@@ -0,0 +1,207 @@
+# Copyright 2025 Tencent Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Platform detection and backend selection for AngelSlim.
+
+This module provides utilities for detecting the runtime environment
+and selecting appropriate backends (Triton vs PyTorch) based on
+platform capabilities.
+
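+Environment Variables:
+    ANGELSLIM_BACKEND: Force backend selection ("triton" or "pytorch").
+        Any other value, or leaving it unset, means auto-detection.
+    ANGELSLIM_TORCH_COMPILE: Enable/disable torch.compile ("1"/"true" or
+        "0"/"false", case-insensitive). When unset, torch.compile is
+        disabled on Windows.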
+"""
+
+import os
+import sys
+from enum import Enum
+from functools import lru_cache
+from typing import Optional
+
+import torch
+
+
+class Platform(Enum):
+    """Supported platforms, as reported by get_platform()."""
+
+    # Returned when sys.platform starts with "linux".
+    LINUX = "linux"
+    # Returned when sys.platform is exactly "win32".
+    WINDOWS = "windows"
+    # Returned when sys.platform is exactly "darwin".
+    MACOS = "macos"
+    # Fallback for every other sys.platform value.
+    UNKNOWN = "unknown"
+
+
+class Backend(Enum):
+    """Available computation backends."""
+
+    # Selected by get_default_backend() when is_triton_available() is true.
+    TRITON = "triton"
+    # Fallback selected by get_default_backend() otherwise.
+    PYTORCH = "pytorch"
+
+
+@lru_cache(maxsize=1)
+def get_platform() -> Platform:
+    """Map ``sys.platform`` onto a ``Platform`` member (result is cached)."""
+    name = sys.platform
+    # Linux is matched by prefix; the remaining platforms are matched exactly.
+    if name.startswith("linux"):
+        return Platform.LINUX
+    exact_matches = {"win32": Platform.WINDOWS, "darwin": Platform.MACOS}
+    return exact_matches.get(name, Platform.UNKNOWN)
+
+
+@lru_cache(maxsize=1)
+def is_triton_available() -> bool:
+    """
+    Check if Triton is available and functional.
+
+    The ``ANGELSLIM_BACKEND`` environment variable takes precedence:
+    ``pytorch`` disables Triton unconditionally, while ``triton`` demands it
+    and raises if Triton or CUDA is missing (the JIT smoke test is skipped in
+    that case). Any other value triggers auto-detection: CUDA must be
+    available, ``triton`` must import, and a trivial JIT kernel must run.
+
+    The result is cached after the first successful call; a raised
+    ``RuntimeError`` is not cached and is raised again on the next call.
+
+    Returns:
+        bool: True if Triton can be used, False otherwise.
+
+    Raises:
+        RuntimeError: If ``ANGELSLIM_BACKEND=triton`` but Triton is not
+            installed or CUDA is not available.
+    """
+    # Strip whitespace: ``set VAR=triton `` in a batch file keeps the
+    # trailing space, which would otherwise silently disable the override.
+    env_backend = os.environ.get("ANGELSLIM_BACKEND", "").strip().lower()
+    if env_backend == "pytorch":
+        return False
+
+    if env_backend == "triton":
+        # User explicitly requested Triton: report problems instead of
+        # silently falling back.
+        try:
+            import triton  # noqa: F401
+        except ImportError as err:
+            raise RuntimeError(
+                "ANGELSLIM_BACKEND=triton but triton is not installed"
+            ) from err
+        if not torch.cuda.is_available():
+            raise RuntimeError("ANGELSLIM_BACKEND=triton but CUDA is not available")
+        return True
+
+    # Auto-detection: check CUDA availability first
+    if not torch.cuda.is_available():
+        return False
+
+    try:
+        import triton  # noqa: F401
+    except Exception:
+        # Covers ImportError as well as installs that fail with another
+        # exception type at import time.
+        return False
+
+    # Importing is not enough; confirm that JIT compilation works.
+    # _test_triton_jit() handles its own exceptions and returns False.
+    return _test_triton_jit()
+
+
+def _test_triton_jit() -> bool:
+    """
+    Smoke-test Triton by JIT-compiling and launching a tiny kernel.
+
+    A successful ``import triton`` is not sufficient evidence that Triton
+    works (triton-windows may import but fail at JIT time), so an add-one
+    kernel is compiled, launched on a CUDA buffer of zeros, and its output
+    is compared against ones.
+
+    Returns:
+        bool: True if the kernel produced all ones; False if the result was
+        wrong or any exception was raised along the way.
+    """
+    try:
+        import triton
+        import triton.language as tl
+
+        @triton.jit
+        def _add_one_kernel(buf_ptr, BLOCK: tl.constexpr):
+            block_id = tl.program_id(0)
+            idx = block_id * BLOCK + tl.arange(0, BLOCK)
+            vals = tl.load(buf_ptr + idx)
+            tl.store(buf_ptr + idx, vals + 1.0)
+
+        num_elems = 128
+        buf = torch.zeros(num_elems, device="cuda", dtype=torch.float32)
+        # One program instance with BLOCK == num_elems covers the buffer.
+        _add_one_kernel[(1,)](buf, BLOCK=num_elems)
+        torch.cuda.synchronize()
+
+        expected = torch.ones(num_elems, device="cuda", dtype=torch.float32)
+        return torch.allclose(buf, expected)
+    except Exception:
+        return False
+
+
+@lru_cache(maxsize=1)
+def get_default_backend() -> Backend:
+    """
+    Pick the computation backend for the current environment.
+
+    Triton is chosen whenever ``is_triton_available()`` reports it usable
+    (that check is also where ``ANGELSLIM_BACKEND`` is honored); PyTorch is
+    the fallback. The decision is cached after the first call.
+
+    Returns:
+        Backend: The selected backend.
+    """
+    return Backend.TRITON if is_triton_available() else Backend.PYTORCH
+
+
+@lru_cache(maxsize=1)
+def is_torch_compile_supported() -> bool:
+    """
+    Check if torch.compile is supported and should be enabled.
+
+    Decision order:
+    1. ``ANGELSLIM_TORCH_COMPILE`` set to ``0``/``false`` disables it.
+    2. A PyTorch build that has no ``torch.compile`` attribute cannot use
+       it, even when the environment variable requests it.
+    3. ``ANGELSLIM_TORCH_COMPILE`` set to ``1``/``true`` enables it.
+    4. Otherwise it is disabled on Windows and enabled elsewhere.
+
+    The result is cached, so the environment variable is only read on the
+    first call.
+
+    Returns:
+        bool: True if torch.compile should be used.
+    """
+    # Strip whitespace: ``set VAR=1 `` in a batch file keeps the trailing
+    # space, which would otherwise make the override go unrecognized.
+    env_compile = os.environ.get("ANGELSLIM_TORCH_COMPILE", "").strip().lower()
+    if env_compile in ("0", "false"):
+        return False
+
+    # Direct capability check instead of parsing torch.__version__; it also
+    # keeps a forced "1" from returning True when torch.compile is missing.
+    if not hasattr(torch, "compile"):
+        return False
+
+    if env_compile in ("1", "true"):
+        return True
+
+    # Windows: torch.compile has issues with dynamo
+    return get_platform() != Platform.WINDOWS
+
+
+def use_triton() -> bool:
+    """Return True when the selected default backend is Triton."""
+    selected = get_default_backend()
+    return selected is Backend.TRITON
+
+
+def use_pytorch() -> bool:
+    """Return True when the selected default backend is the PyTorch fallback."""
+    selected = get_default_backend()
+    return selected is Backend.PYTORCH
+
+
+def get_backend_info() -> dict:
+    """
+    Collect a snapshot of the current backend configuration.
+
+    Returns:
+        dict: Platform name, selected backend, Triton and torch.compile
+        availability, CUDA availability and device name (None without
+        CUDA), the PyTorch version, and the raw values of the two
+        ``ANGELSLIM_*`` environment variables ("auto" when unset).
+    """
+    info = {
+        "platform": get_platform().value,
+        "backend": get_default_backend().value,
+        "triton_available": is_triton_available(),
+        "torch_compile_supported": is_torch_compile_supported(),
+    }
+
+    has_cuda = torch.cuda.is_available()
+    info["cuda_available"] = has_cuda
+    # The device name is only queried when a CUDA device exists.
+    info["cuda_device"] = torch.cuda.get_device_name() if has_cuda else None
+
+    env = os.environ
+    info.update(
+        torch_version=torch.__version__,
+        env_backend=env.get("ANGELSLIM_BACKEND", "auto"),
+        env_torch_compile=env.get("ANGELSLIM_TORCH_COMPILE", "auto"),
+    )
+    return info
@@ -1,11 +1,27 @@
 import math
-from typing import Any, List, Optional, Set, Tuple
+from typing import Any, Callable, List, Optional, Set, Tuple
 
 import torch
 import torch.nn as nn
 
 from .cache_helper import CacheHelper
 
+# torch.compile is applied conditionally, as decided by
+# is_torch_compile_supported(): off by default on Windows and whenever
+# ANGELSLIM_TORCH_COMPILE=0.
+try:
+    from angelslim.compressor._platform import is_torch_compile_supported
+except ImportError:
+    # Platform helper not importable: leave the functions uncompiled.
+    _USE_TORCH_COMPILE = False
+else:
+    _USE_TORCH_COMPILE = is_torch_compile_supported()
+
+
+def _conditional_compile(func: Callable) -> Callable:
+    """Return ``torch.compile(func)`` if compilation is enabled, else ``func``."""
+    return torch.compile(func) if _USE_TORCH_COMPILE else func
+
 
 class TaylorCacheHelper(CacheHelper):
     """
@@ -137,7 +153,7 @@ def clear_states(self) -> None:
         self.taylor_cache.clear_derivatives()
 
 
-@torch.compile
+@_conditional_compile
 def decomposition_FFT(
     x: torch.Tensor, cutoff_ratio: float = 0.1
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -188,7 +204,7 @@ def decomposition_FFT(
     return low, high
 
 
-@torch.compile
+@_conditional_compile
 def reconstruction(low_freq: torch.Tensor, high_freq: torch.Tensor) -> torch.Tensor:
+    """Return the sum of the low-frequency and high-frequency components."""
     return low_freq + high_freq
 
 
@@ -12,6 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .fp8_gemm import fp8_gemm_triton_block
+"""
+FP8 GEMM kernels with automatic backend selection.
 
-__all__ = ["fp8_gemm_triton_block"]
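+This module binds ``fp8_gemm_triton_block`` to the Triton kernel when Triton
+is usable (see ``angelslim.compressor._platform.use_triton``) and to the
+PyTorch fallback otherwise. The choice follows backend availability, not the
+operating system alone.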
+"""
+
+from angelslim.compressor._platform import use_triton
+
+# Conditional imports based on platform/backend availability.
+# The public name ``fp8_gemm_triton_block`` is bound in both branches, so
+# existing imports of that name keep working whichever backend is selected.
+if use_triton():
+    from .fp8_gemm import fp8_gemm_triton_block
+else:
+    # PyTorch fallback implementation, exposed under the Triton name
+    from .fp8_gemm_torch import fp8_gemm_torch_block as fp8_gemm_triton_block
+
+# Also export PyTorch version directly for explicit use
+from .fp8_gemm_torch import fp8_gemm_torch_block
+
+__all__ = ["fp8_gemm_triton_block", "fp8_gemm_torch_block"]