Commit da272cc

Merge pull request #23 from codewithdark-git/copilot/v2-1-gpu-acceleration-memory-optimization
Add v2.1 performance primitives: Triton Q4_0/Q8_0 + INT4 matmul and memory-aware GGUF export
2 parents 26230d0 + acfc222 commit da272cc

6 files changed: 165 additions & 6 deletions


docs/guide/gguf-export.md

Lines changed: 21 additions & 2 deletions
@@ -209,11 +209,30 @@ model.export("gguf", "model.gguf", quantization="Q4_K_M")
 For very large models:
 
 ```python
+# Note: previous `streaming=True` guidance is superseded by `chunked_conversion=True`.
+# If you previously used `streaming=True`, replace it with `chunked_conversion=True` (streaming has no effect here).
+
 # Use lower quantization
 model.export("gguf", "model.Q3_K_M.gguf", quantization="Q3_K_M")
 
-# Or export with streaming (reduces memory)
-model.export("gguf", "model.gguf", quantization="Q4_K_M", streaming=True)
+# Enable chunked conversion + smart ordering
+model.export(
+    "gguf",
+    "model.gguf",
+    quantization="Q4_K_M",
+    chunked_conversion=True,
+    max_shard_size="2GB",
+    smart_tensor_ordering=True,
+)
+
+# Force intermediate files to a dedicated disk offload directory
+model.export(
+    "gguf",
+    "model.gguf",
+    quantization="Q4_K_M",
+    disk_offloading=True,
+    disk_offload_dir="./quantllm_offload",
+)
 ```
 
 ### Windows Issues

quantllm/core/memory.py

Lines changed: 21 additions & 0 deletions
@@ -14,6 +14,7 @@
 import gc
 from typing import Optional, Dict, Any, List, Union, Callable
 from contextlib import contextmanager
+from collections import OrderedDict
 import torch
 import torch.nn as nn
 
@@ -186,6 +187,26 @@ def estimate_model_memory(
     }
 
 
+def memory_optimized_tensor_order(
+    state_dict: Dict[str, torch.Tensor],
+    *,
+    prioritize_large_tensors: bool = True,
+) -> OrderedDict[str, torch.Tensor]:
+    """
+    Return an ordered state dict to reduce peak memory pressure during serialization.
+
+    By default, tensors are sorted by total byte size (numel * element_size),
+    with larger tensors emitted first to reduce long-lived allocator pressure
+    in shard-based writes on very large checkpoints.
+    """
+    sorted_items = sorted(
+        state_dict.items(),
+        key=lambda kv: kv[1].numel() * kv[1].element_size(),
+        reverse=prioritize_large_tensors,
+    )
+    return OrderedDict(sorted_items)
+
+
 class DynamicOffloader:
     """
     Dynamic layer offloading for large models.
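For orientation, here is a minimal sketch of how the new ordering helper behaves; the toy state dict below is illustrative and not part of the commit:

```python
# Illustrative only: a tiny state dict showing that memory_optimized_tensor_order
# emits the largest tensors (by byte size) first.
import torch
from quantllm.core.memory import memory_optimized_tensor_order

state_dict = {
    "small": torch.zeros(8, 8),        # 256 B in fp32
    "large": torch.zeros(1024, 1024),  # 4 MiB in fp32
}
ordered = memory_optimized_tensor_order(state_dict)
print(list(ordered))  # ['large', 'small']
```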

quantllm/core/turbo_model.py

Lines changed: 41 additions & 3 deletions
@@ -23,6 +23,9 @@
 from ..utils import logger, print_header, print_success, print_error, print_info, print_warning, QuantLLMProgress
 from transformers.utils.logging import disable_progress_bar as disable_hf_progress_bar
 from datasets.utils.logging import disable_progress_bar as disable_ds_progress_bar
+from .memory import memory_optimized_tensor_order
+
+DEFAULT_CHUNKED_SHARD_SIZE = "2GB"
 
 
 class TurboModel:
@@ -1127,6 +1130,11 @@ def _export_gguf(
         output_path: str,
         quantization: Optional[str] = None,
         fast_mode: bool = False,
+        chunked_conversion: bool = False,
+        max_shard_size: Optional[str] = None,
+        smart_tensor_ordering: bool = False,
+        disk_offloading: bool = False,
+        disk_offload_dir: Optional[str] = None,
         **kwargs
     ) -> str:
         """
@@ -1144,13 +1152,22 @@ def _export_gguf(
             output_path: Output file path for GGUF
             quantization: Quantization type (Q4_K_M, Q5_K_M, Q8_0, etc.)
             fast_mode: Skip intermediate F16 step for faster export (slightly less optimal)
+            chunked_conversion: Save model shards during conversion for large checkpoints
+            max_shard_size: Max shard size used when chunked conversion is active
+            smart_tensor_ordering: Save tensors in memory-optimized order
+            disk_offloading: Use a dedicated temp/offload directory for intermediate artifacts
+            disk_offload_dir: Directory used when disk_offloading=True
         """
         from ..quant import convert_to_gguf, quantize_gguf, ensure_llama_cpp_installed, GGUF_QUANT_TYPES
         from ..utils import QuantLLMProgress, format_time, format_size
         import time
 
         start_time = time.time()
 
+        effective_shard_size = max_shard_size or (
+            DEFAULT_CHUNKED_SHARD_SIZE if chunked_conversion else None
+        )
+
         quant_type = quantization or self.config.quant_type or "q4_k_m"
         quant_type_upper = quant_type.upper()
         quant_type_lower = quant_type.lower()
@@ -1163,6 +1180,13 @@ def _export_gguf(
         print_info(f"Target quantization: {quant_type_upper}")
         if fast_mode:
             print_info("Fast mode enabled")
+        if chunked_conversion:
+            print_info(f"Chunked conversion enabled (max_shard_size={effective_shard_size})")
+        if smart_tensor_ordering:
+            print_info("Smart tensor ordering enabled")
+            print_warning("Smart tensor ordering may temporarily materialize a full state dict in memory.")
+        if disk_offloading:
+            print_info(f"Disk offloading enabled ({disk_offload_dir or 'system temp'})")
 
         # Ensure llama.cpp
         if self.verbose:
@@ -1188,21 +1212,35 @@ def _export_gguf(
         # Get model name for file naming
         model_name = self.model.config._name_or_path.split('/')[-1]
 
+        temp_parent = disk_offload_dir if disk_offloading else None
+        if temp_parent:
+            os.makedirs(temp_parent, exist_ok=True)
+
         # Create temp dir for conversion
-        with tempfile.TemporaryDirectory() as temp_dir:
+        with tempfile.TemporaryDirectory(dir=temp_parent) as temp_dir:
             # Step 1: Save model to temp directory
             if self.verbose:
                 print_header("Step 1/3: Saving Model", icon="💾")
                 print_info(f"Staging model to {temp_dir}...")
 
             with QuantLLMProgress() as progress:
                 task = progress.add_task("Saving model weights...", total=None)
+                save_kwargs = {
+                    "safe_serialization": True,
+                }
+                if effective_shard_size:
+                    save_kwargs["max_shard_size"] = effective_shard_size
+
+                if smart_tensor_ordering:
+                    save_kwargs["state_dict"] = memory_optimized_tensor_order(model_to_save.state_dict())
+
                 try:
-                    model_to_save.save_pretrained(temp_dir, safe_serialization=True)
+                    model_to_save.save_pretrained(temp_dir, **save_kwargs)
                 except Exception as e:
                     if self.verbose:
                         print_warning(f"SafeTensors save failed ({e}), using PyTorch format...")
-                    model_to_save.save_pretrained(temp_dir, safe_serialization=False)
+                    save_kwargs["safe_serialization"] = False
+                    model_to_save.save_pretrained(temp_dir, **save_kwargs)
 
                 self.tokenizer.save_pretrained(temp_dir)
                 progress.update(task, completed=100)
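The new flags map onto standard `save_pretrained` arguments. Below is a minimal sketch of the staging step in isolation, assuming a Hugging Face causal LM; the tiny model name and the offload directory are illustrative choices, not values from the commit:

```python
# Sketch of the staging step the new flags drive; model name and directory
# are illustrative assumptions, not taken from the commit.
import os
import tempfile

from transformers import AutoModelForCausalLM
from quantllm.core.memory import memory_optimized_tensor_order

model_to_save = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

save_kwargs = {"safe_serialization": True}
save_kwargs["max_shard_size"] = "2GB"  # what chunked_conversion=True selects by default
save_kwargs["state_dict"] = memory_optimized_tensor_order(  # smart_tensor_ordering=True
    model_to_save.state_dict()
)

offload_dir = "./quantllm_offload"  # disk_offloading=True + disk_offload_dir
os.makedirs(offload_dir, exist_ok=True)
with tempfile.TemporaryDirectory(dir=offload_dir) as temp_dir:
    model_to_save.save_pretrained(temp_dir, **save_kwargs)  # sharded SafeTensors checkpoint
```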

quantllm/kernels/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -7,11 +7,17 @@
 from .triton import (
     TritonQuantizedLinear,
     fused_dequant_matmul,
+    int4_matmul,
     is_triton_available,
+    triton_q4_0_quantize,
+    triton_q8_0_quantize,
 )
 
 __all__ = [
     "TritonQuantizedLinear",
     "fused_dequant_matmul",
+    "int4_matmul",
     "is_triton_available",
+    "triton_q4_0_quantize",
+    "triton_q8_0_quantize",
 ]

quantllm/kernels/triton/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -7,11 +7,17 @@
 from .quantized_linear import (
     TritonQuantizedLinear,
     fused_dequant_matmul,
+    int4_matmul,
     is_triton_available,
+    triton_q4_0_quantize,
+    triton_q8_0_quantize,
 )
 
 __all__ = [
     "TritonQuantizedLinear",
     "fused_dequant_matmul",
+    "int4_matmul",
     "is_triton_available",
+    "triton_q4_0_quantize",
+    "triton_q8_0_quantize",
 ]
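Since both `__init__` files re-export the same symbols, the new kernels should be reachable from the top-level kernels namespace as well as the triton subpackage. A quick check, assuming the package is installed:

```python
# The new symbols are re-exported at the package level, so the short path works.
from quantllm.kernels import (
    int4_matmul,
    is_triton_available,
    triton_q4_0_quantize,
    triton_q8_0_quantize,
)

print("Triton kernels available:", is_triton_available())
```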

quantllm/kernels/triton/quantized_linear.py

Lines changed: 70 additions & 1 deletion
@@ -8,7 +8,7 @@
 Performance: ~2-3x faster than separate dequant + matmul
 """
 
-from typing import Optional, Tuple
+from typing import Callable, Dict, Optional, Tuple
 import torch
 import torch.nn as nn
 
@@ -27,6 +27,69 @@ def is_triton_available() -> bool:
     return _TRITON_AVAILABLE
 
 
+def triton_q8_0_quantize(weight: torch.Tensor, eps: float = 1e-8) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a weight matrix to Q8_0 format (per-column symmetric int8).
+
+    Returns:
+        qweight: int8 tensor [in_features, out_features]
+        scales: fp tensor [1, out_features]
+    """
+    if weight.dim() != 2:
+        raise ValueError(f"Q8_0 quantization expects a 2D tensor, got shape={tuple(weight.shape)}")
+
+    max_abs = weight.abs().amax(dim=0, keepdim=True).clamp(min=eps)
+    scale = max_abs / 127.0
+    qweight = torch.clamp(torch.round(weight / scale), -128, 127).to(torch.int8)
+    return qweight, scale.to(weight.dtype)
+
+
+def triton_q4_0_quantize(weight: torch.Tensor, eps: float = 1e-8) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a weight matrix to Q4_0 format (per-column symmetric 4-bit stored in int8).
+
+    Returns:
+        qweight: int8 tensor [in_features, out_features] with values in [-8, 7]
+        scales: fp tensor [1, out_features]
+    """
+    if weight.dim() != 2:
+        raise ValueError(f"Q4_0 quantization expects a 2D tensor, got shape={tuple(weight.shape)}")
+
+    max_abs = weight.abs().amax(dim=0, keepdim=True).clamp(min=eps)
+    scale = max_abs / 7.0
+    qweight = torch.clamp(torch.round(weight / scale), -8, 7).to(torch.int8)
+    return qweight, scale.to(weight.dtype)
+
+
+def int4_matmul(
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    INT4 matmul path backed by fused dequant+matmul on CUDA/Triton when available.
+
+    Args:
+        x: Input [..., in_features]
+        qweight: Quantized int4 values stored in int8, shape [in_features, out_features]
+        scales: Per-column scales, shape [1, out_features] or [in_features/group, out_features]
+        bias: Optional bias [out_features]
+    """
+    # Per-column case uses [1, N] zeros; grouped quantization uses zeros shaped like scales.
+    is_per_column = scales.shape[0] == 1
+    zeros = scales.new_zeros((1, scales.shape[1])) if is_per_column else scales.new_zeros(scales.shape)
+    group_size = qweight.shape[0] if is_per_column else max(qweight.shape[0] // scales.shape[0], 1)
+    return fused_dequant_matmul(
+        x=x,
+        qweight=qweight,
+        scales=scales,
+        zeros=zeros,
+        bias=bias,
+        group_size=group_size,
+    )
+
+
 if _TRITON_AVAILABLE:
     @triton.jit
     def _fused_dequant_matmul_kernel(
@@ -462,3 +525,9 @@ def extra_repr(self) -> str:
             f'group_size={self.group_size}, '
             f'triton={self._use_triton}'
         )
+
+
+triton_quantizers: Dict[str, Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]] = {
+    "q4_0": triton_q4_0_quantize,
+    "q8_0": triton_q8_0_quantize,
+}
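For completeness, a small round-trip sketch of the new per-column quantizers on CPU; the shapes are illustrative, and the fused `int4_matmul` path is only exercised where Triton/CUDA are available, so it is omitted here:

```python
# Round-trip sketch for the per-column quantizers; shapes are illustrative.
import torch
from quantllm.kernels import triton_q4_0_quantize, triton_q8_0_quantize
from quantllm.kernels.triton.quantized_linear import triton_quantizers

weight = torch.randn(256, 128)  # [in_features, out_features]

# Q8_0: int8 values plus one scale per output column.
q8, s8 = triton_q8_0_quantize(weight)
recon = q8.to(weight.dtype) * s8  # broadcast the [1, out_features] scales
print("Q8_0 max abs error:", (weight - recon).abs().max().item())

# Q4_0: values live in [-8, 7]; the registry dispatches by format name.
q4, s4 = triton_quantizers["q4_0"](weight)
print("Q4_0 value range:", int(q4.min()), int(q4.max()))
```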
