Commit da272cc

Merge pull request #23 from codewithdark-git/copilot/v2-1-gpu-acceleration-memory-optimization
Add v2.1 performance primitives: Triton Q4_0/Q8_0 + INT4 matmul and memory-aware GGUF export
2 parents 26230d0 + acfc222 commit da272cc

6 files changed: 165 additions & 6 deletions


docs/guide/gguf-export.md

Lines changed: 21 additions & 2 deletions
@@ -209,11 +209,30 @@ model.export("gguf", "model.gguf", quantization="Q4_K_M")
 For very large models:
 
 ```python
+# Note: previous `streaming=True` guidance is superseded by `chunked_conversion=True`.
+# If you previously used `streaming=True`, replace it with `chunked_conversion=True` (streaming has no effect here).
+
 # Use lower quantization
 model.export("gguf", "model.Q3_K_M.gguf", quantization="Q3_K_M")
 
-# Or export with streaming (reduces memory)
-model.export("gguf", "model.gguf", quantization="Q4_K_M", streaming=True)
+# Enable chunked conversion + smart ordering
+model.export(
+    "gguf",
+    "model.gguf",
+    quantization="Q4_K_M",
+    chunked_conversion=True,
+    max_shard_size="2GB",
+    smart_tensor_ordering=True,
+)
+
+# Force intermediate files to a dedicated disk offload directory
+model.export(
+    "gguf",
+    "model.gguf",
+    quantization="Q4_K_M",
+    disk_offloading=True,
+    disk_offload_dir="./quantllm_offload",
+)
 ```
 
 ### Windows Issues

quantllm/core/memory.py

Lines changed: 21 additions & 0 deletions
@@ -14,6 +14,7 @@
 import gc
 from typing import Optional, Dict, Any, List, Union, Callable
 from contextlib import contextmanager
+from collections import OrderedDict
 import torch
 import torch.nn as nn
 
@@ -186,6 +187,26 @@ def estimate_model_memory(
     }
 
 
+def memory_optimized_tensor_order(
+    state_dict: Dict[str, torch.Tensor],
+    *,
+    prioritize_large_tensors: bool = True,
+) -> OrderedDict[str, torch.Tensor]:
+    """
+    Return an ordered state dict to reduce peak memory pressure during serialization.
+
+    By default, tensors are sorted by total byte size (numel * element_size),
+    with larger tensors emitted first to reduce long-lived allocator pressure
+    in shard-based writes on very large checkpoints.
+    """
+    sorted_items = sorted(
+        state_dict.items(),
+        key=lambda kv: kv[1].numel() * kv[1].element_size(),
+        reverse=prioritize_large_tensors,
+    )
+    return OrderedDict(sorted_items)
+
+
 class DynamicOffloader:
     """
     Dynamic layer offloading for large models.
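For orientation, here is a minimal sketch of how the new ordering helper behaves; the toy state dict below is illustrative and not part of the commit:

```python
# Illustrative only: a tiny state dict showing that memory_optimized_tensor_order
# emits the largest tensors (by byte size) first.
import torch
from quantllm.core.memory import memory_optimized_tensor_order

state_dict = {
    "small": torch.zeros(8, 8),        # 256 B in fp32
    "large": torch.zeros(1024, 1024),  # 4 MiB in fp32
}
ordered = memory_optimized_tensor_order(state_dict)
print(list(ordered))  # ['large', 'small']
```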

quantllm/core/turbo_model.py

Lines changed: 41 additions & 3 deletions
@@ -23,6 +23,9 @@
 from ..utils import logger, print_header, print_success, print_error, print_info, print_warning, QuantLLMProgress
 from transformers.utils.logging import disable_progress_bar as disable_hf_progress_bar
 from datasets.utils.logging import disable_progress_bar as disable_ds_progress_bar
+from .memory import memory_optimized_tensor_order
+
+DEFAULT_CHUNKED_SHARD_SIZE = "2GB"
 
 
 class TurboModel:
@@ -1127,6 +1130,11 @@ def _export_gguf(
         output_path: str,
         quantization: Optional[str] = None,
         fast_mode: bool = False,
+        chunked_conversion: bool = False,
+        max_shard_size: Optional[str] = None,
+        smart_tensor_ordering: bool = False,
+        disk_offloading: bool = False,
+        disk_offload_dir: Optional[str] = None,
         **kwargs
     ) -> str:
         """
@@ -1144,13 +1152,22 @@ def _export_gguf(
             output_path: Output file path for GGUF
             quantization: Quantization type (Q4_K_M, Q5_K_M, Q8_0, etc.)
             fast_mode: Skip intermediate F16 step for faster export (slightly less optimal)
+            chunked_conversion: Save model shards during conversion for large checkpoints
+            max_shard_size: Max shard size used when chunked conversion is active
+            smart_tensor_ordering: Save tensors in memory-optimized order
+            disk_offloading: Use a dedicated temp/offload directory for intermediate artifacts
+            disk_offload_dir: Directory used when disk_offloading=True
         """
         from ..quant import convert_to_gguf, quantize_gguf, ensure_llama_cpp_installed, GGUF_QUANT_TYPES
         from ..utils import QuantLLMProgress, format_time, format_size
         import time
 
         start_time = time.time()
 
+        effective_shard_size = max_shard_size or (
+            DEFAULT_CHUNKED_SHARD_SIZE if chunked_conversion else None
+        )
+
         quant_type = quantization or self.config.quant_type or "q4_k_m"
         quant_type_upper = quant_type.upper()
         quant_type_lower = quant_type.lower()
@@ -1163,6 +1180,13 @@ def _export_gguf(
         print_info(f"Target quantization: {quant_type_upper}")
         if fast_mode:
             print_info("Fast mode enabled")
+        if chunked_conversion:
+            print_info(f"Chunked conversion enabled (max_shard_size={effective_shard_size})")
+        if smart_tensor_ordering:
+            print_info("Smart tensor ordering enabled")
+            print_warning("Smart tensor ordering may temporarily materialize a full state dict in memory.")
+        if disk_offloading:
+            print_info(f"Disk offloading enabled ({disk_offload_dir or 'system temp'})")
 
         # Ensure llama.cpp
         if self.verbose:
@@ -1188,21 +1212,35 @@ def _export_gguf(
         # Get model name for file naming
         model_name = self.model.config._name_or_path.split('/')[-1]
 
+        temp_parent = disk_offload_dir if disk_offloading else None
+        if temp_parent:
+            os.makedirs(temp_parent, exist_ok=True)
+
         # Create temp dir for conversion
-        with tempfile.TemporaryDirectory() as temp_dir:
+        with tempfile.TemporaryDirectory(dir=temp_parent) as temp_dir:
             # Step 1: Save model to temp directory
             if self.verbose:
                 print_header("Step 1/3: Saving Model", icon="💾")
                 print_info(f"Staging model to {temp_dir}...")
 
             with QuantLLMProgress() as progress:
                 task = progress.add_task("Saving model weights...", total=None)
+                save_kwargs = {
+                    "safe_serialization": True,
+                }
+                if effective_shard_size:
+                    save_kwargs["max_shard_size"] = effective_shard_size
+
+                if smart_tensor_ordering:
+                    save_kwargs["state_dict"] = memory_optimized_tensor_order(model_to_save.state_dict())
+
                 try:
-                    model_to_save.save_pretrained(temp_dir, safe_serialization=True)
+                    model_to_save.save_pretrained(temp_dir, **save_kwargs)
                 except Exception as e:
                     if self.verbose:
                         print_warning(f"SafeTensors save failed ({e}), using PyTorch format...")
-                    model_to_save.save_pretrained(temp_dir, safe_serialization=False)
+                    save_kwargs["safe_serialization"] = False
+                    model_to_save.save_pretrained(temp_dir, **save_kwargs)
 
                 self.tokenizer.save_pretrained(temp_dir)
                 progress.update(task, completed=100)
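The new flags map onto standard `save_pretrained` arguments. Below is a minimal sketch of the staging step in isolation, assuming a Hugging Face causal LM; the tiny model name and the offload directory are illustrative choices, not values from the commit:

```python
# Sketch of the staging step the new flags drive; model name and directory
# are illustrative assumptions, not taken from the commit.
import os
import tempfile

from transformers import AutoModelForCausalLM
from quantllm.core.memory import memory_optimized_tensor_order

model_to_save = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

save_kwargs = {"safe_serialization": True}
save_kwargs["max_shard_size"] = "2GB"  # what chunked_conversion=True selects by default
save_kwargs["state_dict"] = memory_optimized_tensor_order(  # smart_tensor_ordering=True
    model_to_save.state_dict()
)

offload_dir = "./quantllm_offload"  # disk_offloading=True + disk_offload_dir
os.makedirs(offload_dir, exist_ok=True)
with tempfile.TemporaryDirectory(dir=offload_dir) as temp_dir:
    model_to_save.save_pretrained(temp_dir, **save_kwargs)  # sharded SafeTensors checkpoint
```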

quantllm/kernels/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -7,11 +7,17 @@
 from .triton import (
     TritonQuantizedLinear,
     fused_dequant_matmul,
+    int4_matmul,
     is_triton_available,
+    triton_q4_0_quantize,
+    triton_q8_0_quantize,
 )
 
 __all__ = [
     "TritonQuantizedLinear",
     "fused_dequant_matmul",
+    "int4_matmul",
     "is_triton_available",
+    "triton_q4_0_quantize",
+    "triton_q8_0_quantize",
 ]

quantllm/kernels/triton/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -7,11 +7,17 @@
 from .quantized_linear import (
     TritonQuantizedLinear,
     fused_dequant_matmul,
+    int4_matmul,
     is_triton_available,
+    triton_q4_0_quantize,
+    triton_q8_0_quantize,
 )
 
 __all__ = [
     "TritonQuantizedLinear",
     "fused_dequant_matmul",
+    "int4_matmul",
     "is_triton_available",
+    "triton_q4_0_quantize",
+    "triton_q8_0_quantize",
 ]
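Since both `__init__` files re-export the same symbols, the new kernels should be reachable from the top-level kernels namespace as well as the triton subpackage. A quick check, assuming the package is installed:

```python
# The new symbols are re-exported at the package level, so the short path works.
from quantllm.kernels import (
    int4_matmul,
    is_triton_available,
    triton_q4_0_quantize,
    triton_q8_0_quantize,
)

print("Triton kernels available:", is_triton_available())
```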

quantllm/kernels/triton/quantized_linear.py

Lines changed: 70 additions & 1 deletion
@@ -8,7 +8,7 @@
 Performance: ~2-3x faster than separate dequant + matmul
 """
 
-from typing import Optional, Tuple
+from typing import Callable, Dict, Optional, Tuple
 import torch
 import torch.nn as nn
 
@@ -27,6 +27,69 @@ def is_triton_available() -> bool:
     return _TRITON_AVAILABLE
 
 
+def triton_q8_0_quantize(weight: torch.Tensor, eps: float = 1e-8) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a weight matrix to Q8_0 format (per-column symmetric int8).
+
+    Returns:
+        qweight: int8 tensor [in_features, out_features]
+        scales: fp tensor [1, out_features]
+    """
+    if weight.dim() != 2:
+        raise ValueError(f"Q8_0 quantization expects a 2D tensor, got shape={tuple(weight.shape)}")
+
+    max_abs = weight.abs().amax(dim=0, keepdim=True).clamp(min=eps)
+    scale = max_abs / 127.0
+    qweight = torch.clamp(torch.round(weight / scale), -128, 127).to(torch.int8)
+    return qweight, scale.to(weight.dtype)
+
+
+def triton_q4_0_quantize(weight: torch.Tensor, eps: float = 1e-8) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Quantize a weight matrix to Q4_0 format (per-column symmetric 4-bit stored in int8).
+
+    Returns:
+        qweight: int8 tensor [in_features, out_features] with values in [-8, 7]
+        scales: fp tensor [1, out_features]
+    """
+    if weight.dim() != 2:
+        raise ValueError(f"Q4_0 quantization expects a 2D tensor, got shape={tuple(weight.shape)}")
+
+    max_abs = weight.abs().amax(dim=0, keepdim=True).clamp(min=eps)
+    scale = max_abs / 7.0
+    qweight = torch.clamp(torch.round(weight / scale), -8, 7).to(torch.int8)
+    return qweight, scale.to(weight.dtype)
+
+
+def int4_matmul(
+    x: torch.Tensor,
+    qweight: torch.Tensor,
+    scales: torch.Tensor,
+    bias: Optional[torch.Tensor] = None,
+) -> torch.Tensor:
+    """
+    INT4 matmul path backed by fused dequant+matmul on CUDA/Triton when available.
+
+    Args:
+        x: Input [..., in_features]
+        qweight: Quantized int4 values stored in int8, shape [in_features, out_features]
+        scales: Per-column scales, shape [1, out_features] or [in_features/group, out_features]
+        bias: Optional bias [out_features]
+    """
+    # Per-column case uses [1, N] zeros; grouped quantization uses zeros shaped like scales.
+    is_per_column = scales.shape[0] == 1
+    zeros = scales.new_zeros((1, scales.shape[1])) if is_per_column else scales.new_zeros(scales.shape)
+    group_size = qweight.shape[0] if is_per_column else max(qweight.shape[0] // scales.shape[0], 1)
+    return fused_dequant_matmul(
+        x=x,
+        qweight=qweight,
+        scales=scales,
+        zeros=zeros,
+        bias=bias,
+        group_size=group_size,
+    )
+
+
 if _TRITON_AVAILABLE:
     @triton.jit
     def _fused_dequant_matmul_kernel(
@@ -462,3 +525,9 @@ def extra_repr(self) -> str:
             f'group_size={self.group_size}, '
             f'triton={self._use_triton}'
         )
+
+
+triton_quantizers: Dict[str, Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]] = {
+    "q4_0": triton_q4_0_quantize,
+    "q8_0": triton_q8_0_quantize,
+}
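For completeness, a small round-trip sketch of the new per-column quantizers on CPU; the shapes are illustrative, and the fused `int4_matmul` path is only exercised where Triton/CUDA are available, so it is omitted here:

```python
# Round-trip sketch for the per-column quantizers; shapes are illustrative.
import torch
from quantllm.kernels import triton_q4_0_quantize, triton_q8_0_quantize
from quantllm.kernels.triton.quantized_linear import triton_quantizers

weight = torch.randn(256, 128)  # [in_features, out_features]

# Q8_0: int8 values plus one scale per output column.
q8, s8 = triton_q8_0_quantize(weight)
recon = q8.to(weight.dtype) * s8  # broadcast the [1, out_features] scales
print("Q8_0 max abs error:", (weight - recon).abs().max().item())

# Q4_0: values live in [-8, 7]; the registry dispatches by format name.
q4, s4 = triton_quantizers["q4_0"](weight)
print("Q4_0 value range:", int(q4.min()), int(q4.max()))
```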
