
Commit acfc222

refactor: polish roadmap APIs and memory export defaults
Agent-Logs-Url: https://github.com/codewithdark-git/QuantLLM/sessions/fc1f2077-187e-4757-9927-2c60ef187666
Co-authored-by: codewithdark-git <144595403+codewithdark-git@users.noreply.github.com>
1 parent 52429d6 commit acfc222

4 files changed

Lines changed: 28 additions & 16 deletions


docs/guide/gguf-export.md

Lines changed: 3 additions & 0 deletions
@@ -209,6 +209,9 @@ model.export("gguf", "model.gguf", quantization="Q4_K_M")
 For very large models:
 
 ```python
+# Note: previous `streaming=True` guidance is superseded by `chunked_conversion=True`.
+# If you previously used `streaming=True`, replace it with `chunked_conversion=True` (streaming has no effect here).
+
 # Use lower quantization
 model.export("gguf", "model.Q3_K_M.gguf", quantization="Q3_K_M")
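
For callers, the documented change boils down to swapping one keyword. A minimal usage sketch, assuming `model` is an already-loaded QuantLLM model and that `export()` forwards these keyword arguments to the GGUF exporter updated later in this commit:

```python
# Sketch only: `model` and the keyword forwarding are assumptions based on this commit's diff.
model.export(
    "gguf",
    "model.Q4_K_M.gguf",
    quantization="Q4_K_M",
    chunked_conversion=True,  # replaces the old `streaming=True` guidance
    # max_shard_size is optional; with chunked_conversion=True it now defaults to "2GB"
)
```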

quantllm/core/memory.py

Lines changed: 5 additions & 5 deletions
@@ -12,7 +12,7 @@
 """
 
 import gc
-from typing import Optional, Dict, Any, List, Union, Callable, Iterable
+from typing import Optional, Dict, Any, List, Union, Callable
 from contextlib import contextmanager
 from collections import OrderedDict
 import torch
@@ -191,16 +191,16 @@ def memory_optimized_tensor_order(
     state_dict: Dict[str, torch.Tensor],
     *,
     prioritize_large_tensors: bool = True,
-) -> "OrderedDict[str, torch.Tensor]":
+) -> OrderedDict[str, torch.Tensor]:
     """
     Return an ordered state dict to reduce peak memory pressure during serialization.
 
-    By default, larger tensors are emitted first to reduce long-lived allocator pressure
+    By default, tensors are sorted by total byte size (numel * element_size),
+    with larger tensors emitted first to reduce long-lived allocator pressure
     in shard-based writes on very large checkpoints.
     """
-    items: Iterable = state_dict.items()
     sorted_items = sorted(
-        items,
+        state_dict.items(),
         key=lambda kv: kv[1].numel() * kv[1].element_size(),
         reverse=prioritize_large_tensors,
     )
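
A standalone sketch of the ordering rule this helper now documents, sorting by total byte size (`numel * element_size`) with the largest tensors first; the toy module below is illustrative and not part of the QuantLLM codebase:

```python
from collections import OrderedDict
import torch

def order_by_byte_size(state_dict, largest_first=True):
    # Mirrors the commit's sort key: total bytes per tensor, largest first.
    items = sorted(
        state_dict.items(),
        key=lambda kv: kv[1].numel() * kv[1].element_size(),
        reverse=largest_first,
    )
    return OrderedDict(items)

# Illustrative usage on a toy module.
toy = torch.nn.Sequential(torch.nn.Linear(4096, 4096), torch.nn.Linear(4096, 8))
ordered = order_by_byte_size(toy.state_dict())
print(list(ordered))  # large weight matrices appear before the small biases
```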

quantllm/core/turbo_model.py

Lines changed: 15 additions & 8 deletions
@@ -23,6 +23,9 @@
 from ..utils import logger, print_header, print_success, print_error, print_info, print_warning, QuantLLMProgress
 from transformers.utils.logging import disable_progress_bar as disable_hf_progress_bar
 from datasets.utils.logging import disable_progress_bar as disable_ds_progress_bar
+from .memory import memory_optimized_tensor_order
+
+DEFAULT_CHUNKED_SHARD_SIZE = "2GB"
 
 
 class TurboModel:
@@ -1127,6 +1130,11 @@ def _export_gguf(
         output_path: str,
         quantization: Optional[str] = None,
         fast_mode: bool = False,
+        chunked_conversion: bool = False,
+        max_shard_size: Optional[str] = None,
+        smart_tensor_ordering: bool = False,
+        disk_offloading: bool = False,
+        disk_offload_dir: Optional[str] = None,
         **kwargs
     ) -> str:
         """
@@ -1156,11 +1164,9 @@ def _export_gguf(
 
         start_time = time.time()
 
-        chunked_conversion = bool(kwargs.pop("chunked_conversion", False))
-        max_shard_size = kwargs.pop("max_shard_size", "2GB" if chunked_conversion else "50GB")
-        smart_tensor_ordering = bool(kwargs.pop("smart_tensor_ordering", False))
-        disk_offloading = bool(kwargs.pop("disk_offloading", False))
-        disk_offload_dir = kwargs.pop("disk_offload_dir", None)
+        effective_shard_size = max_shard_size or (
+            DEFAULT_CHUNKED_SHARD_SIZE if chunked_conversion else None
+        )
 
         quant_type = quantization or self.config.quant_type or "q4_k_m"
         quant_type_upper = quant_type.upper()
@@ -1175,9 +1181,10 @@
         if fast_mode:
             print_info("Fast mode enabled")
         if chunked_conversion:
-            print_info(f"Chunked conversion enabled (max_shard_size={max_shard_size})")
+            print_info(f"Chunked conversion enabled (max_shard_size={effective_shard_size})")
         if smart_tensor_ordering:
             print_info("Smart tensor ordering enabled")
+            print_warning("Smart tensor ordering may temporarily materialize a full state dict in memory.")
         if disk_offloading:
             print_info(f"Disk offloading enabled ({disk_offload_dir or 'system temp'})")
 
@@ -1220,11 +1227,11 @@
         task = progress.add_task("Saving model weights...", total=None)
         save_kwargs = {
             "safe_serialization": True,
-            "max_shard_size": max_shard_size,
         }
+        if effective_shard_size:
+            save_kwargs["max_shard_size"] = effective_shard_size
 
         if smart_tensor_ordering:
-            from .memory import memory_optimized_tensor_order
             save_kwargs["state_dict"] = memory_optimized_tensor_order(model_to_save.state_dict())
 
         try:
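
The shard-size default now resolves explicitly from the new keyword arguments instead of via `kwargs.pop`. A standalone sketch of that resolution and of the conditional `save_kwargs` assembly; the constant and the behaviour come from the hunks above, while the helper name `resolve_shard_size` is only for illustration:

```python
from typing import Optional

DEFAULT_CHUNKED_SHARD_SIZE = "2GB"  # module-level default introduced by this commit

def resolve_shard_size(max_shard_size: Optional[str], chunked_conversion: bool) -> Optional[str]:
    # An explicit max_shard_size wins; otherwise chunked conversion falls back to the
    # 2GB default, and non-chunked exports leave the shard size unset.
    return max_shard_size or (DEFAULT_CHUNKED_SHARD_SIZE if chunked_conversion else None)

assert resolve_shard_size(None, chunked_conversion=True) == "2GB"
assert resolve_shard_size(None, chunked_conversion=False) is None  # previously defaulted to "50GB"
assert resolve_shard_size("1GB", chunked_conversion=True) == "1GB"

# The underlying save call only receives max_shard_size when a size was actually resolved.
save_kwargs = {"safe_serialization": True}
effective_shard_size = resolve_shard_size(None, chunked_conversion=True)
if effective_shard_size:
    save_kwargs["max_shard_size"] = effective_shard_size
```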

quantllm/kernels/triton/quantized_linear.py

Lines changed: 5 additions & 3 deletions
@@ -76,8 +76,10 @@ def int4_matmul(
         scales: Per-column scales, shape [1, out_features] or [in_features/group, out_features]
         bias: Optional bias [out_features]
     """
-    zeros = torch.zeros_like(scales)
-    group_size = qweight.shape[0] if scales.shape[0] == 1 else max(qweight.shape[0] // scales.shape[0], 1)
+    # Per-column case uses [1, N] zeros; grouped quantization uses zeros shaped like scales.
+    is_per_column = scales.shape[0] == 1
+    zeros = scales.new_zeros((1, scales.shape[1])) if is_per_column else scales.new_zeros(scales.shape)
+    group_size = qweight.shape[0] if is_per_column else max(qweight.shape[0] // scales.shape[0], 1)
     return fused_dequant_matmul(
         x=x,
         qweight=qweight,
@@ -525,7 +527,7 @@ def extra_repr(self) -> str:
         )
 
 
-TRITON_QUANT_KERNELS: Dict[str, Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]] = {
+triton_quantizers: Dict[str, Callable[[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]] = {
    "q4_0": triton_q4_0_quantize,
    "q8_0": triton_q8_0_quantize,
 }
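
A small worked example of the new per-column vs. grouped branch at the top of `int4_matmul`; the logic mirrors the hunk above, the tensor shapes are illustrative, and `fused_dequant_matmul` itself is not reproduced here:

```python
import torch

def zeros_and_group_size(qweight: torch.Tensor, scales: torch.Tensor):
    # Per-column scales ([1, N]) get [1, N] zeros and one group spanning all weight rows;
    # grouped scales get zeros shaped like scales and a group size derived from the row counts.
    is_per_column = scales.shape[0] == 1
    zeros = scales.new_zeros((1, scales.shape[1])) if is_per_column else scales.new_zeros(scales.shape)
    group_size = qweight.shape[0] if is_per_column else max(qweight.shape[0] // scales.shape[0], 1)
    return zeros, group_size

# Stand-in for the packed weight tensor; only its first dimension matters here.
qweight = torch.zeros(4096, 4096, dtype=torch.int32)

# Per-column quantization: one scale per output column.
z, g = zeros_and_group_size(qweight, torch.ones(1, 4096))
print(z.shape, g)  # torch.Size([1, 4096]) 4096

# Grouped quantization: 32 scale rows over 4096 weight rows -> group size 128.
z, g = zeros_and_group_size(qweight, torch.ones(32, 4096))
print(z.shape, g)  # torch.Size([32, 4096]) 128
```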
