feat: Update QuantLLM to v2.1 with new quantization methods and enhanced kernel functionality

codewithdark-git · codewithdark-git · commit c735f58b3fe6 · 2026-04-29T18:14:20.000+05:00
diff --git a/quantllm/__init__.py b/quantllm/__init__.py
@@ -1,5 +1,5 @@
 """
-QuantLLM v2.0 - Ultra-fast LLM Quantization & GGUF Export
+QuantLLM v2.1 - Ultra-fast LLM Quantization & GGUF Export
 
 The simplest way to load, quantize, fine-tune, and export LLMs.
 
@@ -13,16 +13,19 @@
     >>> from quantllm import turbo
     >>> 
     >>> # Load any model (auto-quantizes to 4-bit)
-    >>> model = turbo("meta-llama/Llama-3.2-3B")
+    >>> model = turbo(
+    ...     "meta-llama/Llama-3.2-3B",
+    ...     config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+    ... )
     >>> 
     >>> # Generate text
     >>> model.generate("Hello, world!")
     >>> 
     >>> # Export to GGUF with Q4_K_M quantization
-    >>> model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+    >>> model.export()
     >>> 
     >>> # Push to HuggingFace Hub
-    >>> model.push("username/my-model", format="gguf", quantization="Q4_K_M")
+    >>> model.push("username/my-model")
 """
 
 import os
@@ -32,6 +35,7 @@
 from .core import (
     turbo,
     TurboModel,
+    register_architecture,
     SmartConfig,
     HardwareProfiler,
     ModelAnalyzer,
@@ -73,7 +77,7 @@
 # Configure logging (minimal by default)
 configure_logging("WARNING")
 
-__version__ = "2.0.0"
+__version__ = "2.1.0rc1"
 __title__ = "QuantLLM"
 __description__ = "Ultra-fast LLM Quantization & Export (GGUF, ONNX, MLX)"
 __author__ = "Dark Coder"
@@ -114,6 +118,7 @@ def show_banner(force: bool = False):
     # Main API
     "turbo",
     "TurboModel",
+    "register_architecture",
     "SmartConfig",
     "HardwareProfiler",
     "ModelAnalyzer",
diff --git a/quantllm/kernels/__init__.py b/quantllm/kernels/__init__.py
@@ -7,11 +7,17 @@
 from .triton import (
     TritonQuantizedLinear,
     fused_dequant_matmul,
+    int4_matmul,
     is_triton_available,
+    triton_q4_0_quantize,
+    triton_q8_0_quantize,
 )
 
 __all__ = [
     "TritonQuantizedLinear",
     "fused_dequant_matmul",
+    "int4_matmul",
     "is_triton_available",
+    "triton_q4_0_quantize",
+    "triton_q8_0_quantize",
 ]
diff --git a/quantllm/kernels/triton/__init__.py b/quantllm/kernels/triton/__init__.py
@@ -7,11 +7,17 @@
 from .quantized_linear import (
     TritonQuantizedLinear,
     fused_dequant_matmul,
+    int4_matmul,
     is_triton_available,
+    triton_q4_0_quantize,
+    triton_q8_0_quantize,
 )
 
 __all__ = [
     "TritonQuantizedLinear",
     "fused_dequant_matmul",
+    "int4_matmul",
     "is_triton_available",
+    "triton_q4_0_quantize",
+    "triton_q8_0_quantize",
 ]
diff --git a/quantllm/kernels/triton/quantized_linear.py b/quantllm/kernels/triton/quantized_linear.py
@@ -8,7 +8,7 @@
 Performance: ~2-3x faster than separate dequant + matmul
 """
 
-from typing import Optional, Tuple
+from typing import Callable, Dict, Optional, Tuple
 import torch
 import torch.nn as nn
 
@@ -27,6 +27,69 @@ def is_triton_available() -> bool:
     return _TRITON_AVAILABLE
 
 
+def triton_q8_0_quantize(weight: torch.Tensor, eps: float = 1e-8) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a 2D weight matrix to Q8_0 format (per-column symmetric int8).
+
+    ``eps`` floors each column's max-abs; it is raised to the dtype's smallest normal so the scale cannot be zero.
+    Returns ``(qweight, scales)``: int8 [in_features, out_features] and scales [1, out_features] in ``weight.dtype``.
+    Raises ValueError if ``weight`` is not 2D.
+    """
+    if weight.dim() != 2:
+        raise ValueError(f"Q8_0 quantization expects a 2D tensor, got shape={tuple(weight.shape)}")
+    if weight.is_floating_point():
+        eps = max(eps, torch.finfo(weight.dtype).tiny)  # 1e-8 rounds to 0 in fp16 -> scale 0 -> 0/0 = NaN
+    scale = weight.abs().amax(dim=0, keepdim=True).clamp(min=eps) / 127.0
+    qweight = torch.clamp(torch.round(weight / scale), -128, 127).to(torch.int8)
+    return qweight, scale.to(weight.dtype)
+
+
+def triton_q4_0_quantize(weight: torch.Tensor, eps: float = 1e-8) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a 2D weight matrix to Q4_0 format (per-column symmetric 4-bit stored in int8).
+
+    ``eps`` floors each column's max-abs; it is raised to the dtype's smallest normal so the scale cannot be zero.
+    Returns ``(qweight, scales)``: int8 [in_features, out_features] in [-8, 7] and scales [1, out_features] in ``weight.dtype``.
+    Raises ValueError if ``weight`` is not 2D.
+    """
+    if weight.dim() != 2:
+        raise ValueError(f"Q4_0 quantization expects a 2D tensor, got shape={tuple(weight.shape)}")
+    if weight.is_floating_point():
+        eps = max(eps, torch.finfo(weight.dtype).tiny)  # 1e-8 rounds to 0 in fp16 -> scale 0 -> 0/0 = NaN
+    scale = weight.abs().amax(dim=0, keepdim=True).clamp(min=eps) / 7.0
+    qweight = torch.clamp(torch.round(weight / scale), -8, 7).to(torch.int8)
+    return qweight, scale.to(weight.dtype)
+
+
+def int4_matmul(
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    Run an int4-weight matmul by delegating to ``fused_dequant_matmul`` with all-zero zero-points.
+
+    Args:
+        x: Input [..., in_features]
+        qweight: Quantized int4 values stored in int8, shape [in_features, out_features]
+        scales: Per-column scales [1, out_features], or grouped scales [in_features/group, out_features]
+        bias: Optional bias [out_features]
+    """
+    scale_rows = scales.shape[0]
+    if scale_rows == 1:
+        # A single scale row is treated as one group spanning every row of qweight.
+        zero_points = scales.new_zeros((1, scales.shape[1]))
+        rows_per_group = qweight.shape[0]
+    else:
+        # Grouped case: zero-points mirror the scales' shape; group size is floored and never below 1.
+        zero_points = scales.new_zeros(scales.shape)
+        rows_per_group = max(qweight.shape[0] // scale_rows, 1)
+    return fused_dequant_matmul(
+        x=x, qweight=qweight, scales=scales, zeros=zero_points, bias=bias, group_size=rows_per_group
+    )
+
+
 if _TRITON_AVAILABLE:
     @triton.jit
     def _fused_dequant_matmul_kernel(
@@ -462,3 +525,9 @@ def extra_repr(self) -> str:
             f'group_size={self.group_size}, '
             f'triton={self._use_triton}'
         )
+
+
+triton_quantizers: Dict[str, Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]] = {  # format name -> quantizer
+    "q4_0": triton_q4_0_quantize,  # returns (int8 qweight in [-8, 7], scales)
+    "q8_0": triton_q8_0_quantize,  # returns (int8 qweight, scales)
+}