Skip to content

Commit 1975534

Browse files
feat: Update QuantLLM to v2.1 with memory optimization and training enhancements
- Added memory_optimized_tensor_order function to optimize tensor serialization order.
- Updated AutoBatchSizeFinder to use the new torch.amp.autocast context manager.
- Enhanced load_training_data function to combine instruction and output fields for specific text columns.
- Introduced architecture registration and resolution mechanisms in TurboModel for better model loading.
- Improved model saving and exporting functionalities, including support for chunked conversion and smart tensor ordering.
- Added detailed logging and warnings for quantization and GGUF repository handling.
- Refactored quantization checks and model state reporting for clarity and accuracy.
1 parent ac64988 commit 1975534

6 files changed

Lines changed: 836 additions & 82 deletions

File tree

quantllm/core/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,7 @@
88
from .hardware import HardwareProfiler
99
from .smart_config import SmartConfig
1010
from .model_analyzer import ModelAnalyzer
11-
from .turbo_model import TurboModel, turbo
11+
from .turbo_model import TurboModel, turbo, register_architecture
1212
from .compilation import (
1313
compile_model,
1414
compile_for_inference,
@@ -51,6 +51,7 @@
5151
"ModelAnalyzer",
5252
"TurboModel",
5353
"turbo",
54+
"register_architecture",
5455
# Compilation
5556
"compile_model",
5657
"compile_for_inference",

quantllm/core/export.py

Lines changed: 53 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Universal Export Module for QuantLLM v2.0
2+
Universal Export Module for QuantLLM v2.1
33
44
Provides unified export functionality to multiple formats:
55
- GGUF (llama.cpp, Ollama, LM Studio)
@@ -90,17 +90,48 @@ def _export_gguf(
9090
quantization: Optional[str] = None,
9191
**kwargs,
9292
) -> str:
93-
"""Export to GGUF format."""
94-
from ..quant import convert_to_gguf
93+
"""Export to GGUF format.
94+
95+
Saves the model to a temp directory, then uses convert_to_gguf
96+
and quantize_gguf to produce the final GGUF file.
97+
"""
98+
from ..quant import convert_to_gguf, quantize_gguf, ensure_llama_cpp_installed
99+
import tempfile
95100

96101
quant_type = quantization or "Q4_K_M"
97102

98-
return convert_to_gguf(
99-
self.model,
100-
output_path,
101-
quant_type=quant_type,
102-
**kwargs,
103-
)
103+
ensure_llama_cpp_installed()
104+
105+
model_name = getattr(self.model.config, '_name_or_path', 'model').split('/')[-1]
106+
107+
with tempfile.TemporaryDirectory() as temp_dir:
108+
# Save model to temp
109+
self.model.save_pretrained(temp_dir, safe_serialization=True)
110+
if self.tokenizer:
111+
self.tokenizer.save_pretrained(temp_dir)
112+
113+
# Convert to F16 GGUF
114+
f16_file = os.path.join(temp_dir, f"{model_name}.F16.gguf")
115+
output_files, _ = convert_to_gguf(
116+
model_name=model_name,
117+
input_folder=temp_dir,
118+
model_dtype="f16",
119+
quantization_type="f16",
120+
print_output=False,
121+
)
122+
123+
if output_files:
124+
f16_file = output_files[0]
125+
126+
# Quantize to target
127+
quantize_gguf(
128+
input_gguf=f16_file,
129+
output_gguf=output_path,
130+
quant_type=quant_type,
131+
print_output=False,
132+
)
133+
134+
return output_path
104135

105136
def _export_safetensors(
106137
self,
@@ -220,7 +251,6 @@ def _export_mlx(
220251
quantization: MLX quantization (4bit, 8bit)
221252
"""
222253
try:
223-
import mlx.core as mx
224254
from mlx_lm import convert
225255
except ImportError:
226256
raise ImportError(
@@ -238,13 +268,20 @@ def _export_mlx(
238268
# Convert to MLX
239269
mlx_path = os.path.join(output_path, "mlx_model")
240270

241-
# Use mlx-lm convert
242-
convert_args = [temp_hf_path, "--mlx-path", mlx_path]
271+
# Build convert kwargs
272+
convert_kwargs = {
273+
"hf_path": temp_hf_path,
274+
"mlx_path": mlx_path,
275+
}
243276
if quantization:
244-
if quantization == "4bit":
245-
convert_args.extend(["-q", "--q-bits", "4"])
246-
elif quantization == "8bit":
247-
convert_args.extend(["-q", "--q-bits", "8"])
277+
if quantization in ("4bit", "4"):
278+
convert_kwargs["quantize"] = True
279+
convert_kwargs["q_bits"] = 4
280+
elif quantization in ("8bit", "8"):
281+
convert_kwargs["quantize"] = True
282+
convert_kwargs["q_bits"] = 8
283+
284+
convert(**convert_kwargs)
248285

249286
# Clean up temp
250287
shutil.rmtree(temp_hf_path, ignore_errors=True)

quantllm/core/flash_attention.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,11 @@
1616

1717
try:
1818
from flash_attn import flash_attn_func, flash_attn_varlen_func
19-
from flash_attn.flash_attn_interface import flash_attn_cuda
2019
_FLASH_ATTN_AVAILABLE = True
2120
try:
2221
import flash_attn
2322
_FLASH_ATTN_VERSION = getattr(flash_attn, '__version__', '2.0.0')
24-
except:
23+
except Exception:
2524
_FLASH_ATTN_VERSION = '2.0.0'
2625
except ImportError:
2726
pass

quantllm/core/memory.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Memory Optimization Utilities for QuantLLM v2.0
2+
Memory Optimization Utilities for QuantLLM v2.1
33
44
Advanced memory management for training and inference of large models
55
on limited GPU memory.
@@ -14,6 +14,7 @@
1414
import gc
1515
from typing import Optional, Dict, Any, List, Union, Callable
1616
from contextlib import contextmanager
17+
from collections import OrderedDict
1718
import torch
1819
import torch.nn as nn
1920

@@ -186,6 +187,26 @@ def estimate_model_memory(
186187
}
187188

188189

190+
def memory_optimized_tensor_order(
191+
state_dict: Dict[str, torch.Tensor],
192+
*,
193+
prioritize_large_tensors: bool = True,
194+
) -> OrderedDict[str, torch.Tensor]:
195+
"""
196+
Return an ordered state dict to reduce peak memory pressure during serialization.
197+
198+
By default, tensors are sorted by total byte size (numel * element_size),
199+
with larger tensors emitted first to reduce long-lived allocator pressure
200+
in shard-based writes on very large checkpoints.
201+
"""
202+
sorted_items = sorted(
203+
state_dict.items(),
204+
key=lambda kv: kv[1].numel() * kv[1].element_size(),
205+
reverse=prioritize_large_tensors,
206+
)
207+
return OrderedDict(sorted_items)
208+
209+
189210
class DynamicOffloader:
190211
"""
191212
Dynamic layer offloading for large models.

quantllm/core/training.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Advanced Training Utilities for QuantLLM v2.0
2+
Advanced Training Utilities for QuantLLM v2.1
33
44
Provides auto-configuration and optimization for fine-tuning
55
with minimal user input.
@@ -154,7 +154,7 @@ def _test_batch_size(self, batch_size: int, training: bool) -> bool:
154154

155155
if training:
156156
self.model.train()
157-
with torch.cuda.amp.autocast():
157+
with torch.amp.autocast('cuda'):
158158
outputs = self.model(dummy_input, labels=dummy_input)
159159
loss = outputs.loss
160160
loss.backward()
@@ -163,7 +163,7 @@ def _test_batch_size(self, batch_size: int, training: bool) -> bool:
163163
else:
164164
self.model.eval()
165165
with torch.inference_mode():
166-
with torch.cuda.amp.autocast():
166+
with torch.amp.autocast('cuda'):
167167
self.model(dummy_input)
168168

169169
del dummy_input
@@ -512,7 +512,14 @@ def load_training_data(
512512

513513
# Tokenize
514514
def tokenize_fn(examples):
515-
texts = examples[text_column]
515+
if text_column == '__instruction_output__':
516+
# Combine instruction and output fields
517+
texts = [
518+
f"### Instruction:\n{inst}\n\n### Response:\n{out}"
519+
for inst, out in zip(examples['instruction'], examples['output'])
520+
]
521+
else:
522+
texts = examples[text_column]
516523
result = tokenizer(
517524
texts,
518525
truncation=True,

0 commit comments

Comments
 (0)