feat: Update QuantLLM to v2.1 with memory optimization and training enhancements

codewithdark-git · codewithdark-git · commit 19755344292a · 2026-04-29T18:12:48.000+05:00
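- Added memory_optimized_tensor_order, which returns a state dict as an OrderedDict sorted by tensor byte size (largest first by default) to reduce peak memory pressure during serialization.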
- Updated AutoBatchSizeFinder to use torch.amp.autocast('cuda') in place of the older torch.cuda.amp.autocast() context manager.
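- Enhanced load_training_data to combine the `instruction` and `output` fields into a single "### Instruction / ### Response" prompt when text_column is set to '__instruction_output__'; any other text_column is read as before.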
- Introduced architecture registration and resolution mechanisms in TurboModel for better model loading.
- Improved model saving and export, including support for chunked conversion and smart tensor ordering.
- Added detailed logging and warnings for quantization and GGUF repository handling.
- Refactored quantization checks and model state reporting for clarity and accuracy.
diff --git a/quantllm/core/__init__.py b/quantllm/core/__init__.py
@@ -8,7 +8,7 @@
 from .hardware import HardwareProfiler
 from .smart_config import SmartConfig
 from .model_analyzer import ModelAnalyzer
-from .turbo_model import TurboModel, turbo
+from .turbo_model import TurboModel, turbo, register_architecture
 from .compilation import (
     compile_model,
     compile_for_inference,
@@ -51,6 +51,7 @@
     "ModelAnalyzer",
     "TurboModel",
     "turbo",
+    "register_architecture",
     # Compilation
     "compile_model",
     "compile_for_inference",
diff --git a/quantllm/core/export.py b/quantllm/core/export.py
@@ -1,5 +1,5 @@
 """
-Universal Export Module for QuantLLM v2.0
+Universal Export Module for QuantLLM v2.1
 
 Provides unified export functionality to multiple formats:
 - GGUF (llama.cpp, Ollama, LM Studio)
@@ -90,17 +90,48 @@ def _export_gguf(
         quantization: Optional[str] = None,
         **kwargs,
     ) -> str:
-        """Export to GGUF format."""
-        from ..quant import convert_to_gguf
+        """Export to GGUF format.
+        
+        Saves the model to a temp directory, then uses convert_to_gguf
+        and quantize_gguf to produce the final GGUF file.
+        """
+        from ..quant import convert_to_gguf, quantize_gguf, ensure_llama_cpp_installed
+        import tempfile
         
         quant_type = quantization or "Q4_K_M"
         
-        return convert_to_gguf(
-            self.model,
-            output_path,
-            quant_type=quant_type,
-            **kwargs,
-        )
+        ensure_llama_cpp_installed()
+        
+        model_name = getattr(self.model.config, '_name_or_path', 'model').split('/')[-1]
+        
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save model to temp
+            self.model.save_pretrained(temp_dir, safe_serialization=True)
+            if self.tokenizer:
+                self.tokenizer.save_pretrained(temp_dir)
+            
+            # Convert to F16 GGUF
+            f16_file = os.path.join(temp_dir, f"{model_name}.F16.gguf")
+            output_files, _ = convert_to_gguf(
+                model_name=model_name,
+                input_folder=temp_dir,
+                model_dtype="f16",
+                quantization_type="f16",
+                print_output=False,
+            )
+            
+            if output_files:
+                f16_file = output_files[0]
+            
+            # Quantize to target
+            quantize_gguf(
+                input_gguf=f16_file,
+                output_gguf=output_path,
+                quant_type=quant_type,
+                print_output=False,
+            )
+        
+        return output_path
     
     def _export_safetensors(
         self,
@@ -220,7 +251,6 @@ def _export_mlx(
             quantization: MLX quantization (4bit, 8bit)
         """
         try:
-            import mlx.core as mx
             from mlx_lm import convert
         except ImportError:
             raise ImportError(
@@ -238,13 +268,20 @@ def _export_mlx(
         # Convert to MLX
         mlx_path = os.path.join(output_path, "mlx_model")
         
-        # Use mlx-lm convert
-        convert_args = [temp_hf_path, "--mlx-path", mlx_path]
+        # Build convert kwargs
+        convert_kwargs = {
+            "hf_path": temp_hf_path,
+            "mlx_path": mlx_path,
+        }
         if quantization:
-            if quantization == "4bit":
-                convert_args.extend(["-q", "--q-bits", "4"])
-            elif quantization == "8bit":
-                convert_args.extend(["-q", "--q-bits", "8"])
+            if quantization in ("4bit", "4"):
+                convert_kwargs["quantize"] = True
+                convert_kwargs["q_bits"] = 4
+            elif quantization in ("8bit", "8"):
+                convert_kwargs["quantize"] = True
+                convert_kwargs["q_bits"] = 8
+        
+        convert(**convert_kwargs)
         
         # Clean up temp
         shutil.rmtree(temp_hf_path, ignore_errors=True)
diff --git a/quantllm/core/flash_attention.py b/quantllm/core/flash_attention.py
@@ -16,12 +16,11 @@
 
 try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
-    from flash_attn.flash_attn_interface import flash_attn_cuda
     _FLASH_ATTN_AVAILABLE = True
     try:
         import flash_attn
         _FLASH_ATTN_VERSION = getattr(flash_attn, '__version__', '2.0.0')
-    except:
+    except Exception:
         _FLASH_ATTN_VERSION = '2.0.0'
 except ImportError:
     pass
diff --git a/quantllm/core/memory.py b/quantllm/core/memory.py
@@ -1,5 +1,5 @@
 """
-Memory Optimization Utilities for QuantLLM v2.0
+Memory Optimization Utilities for QuantLLM v2.1
 
 Advanced memory management for training and inference of large models
 on limited GPU memory.
@@ -14,6 +14,7 @@
 import gc
 from typing import Optional, Dict, Any, List, Union, Callable
 from contextlib import contextmanager
+from collections import OrderedDict
 import torch
 import torch.nn as nn
 
@@ -186,6 +187,26 @@ def estimate_model_memory(
         }
 
 
+def memory_optimized_tensor_order(
+    state_dict: Dict[str, torch.Tensor],
+    *,
+    prioritize_large_tensors: bool = True,
+) -> OrderedDict[str, torch.Tensor]:
+    """
+    Return an ordered state dict to reduce peak memory pressure during serialization.
+    
+    By default, tensors are sorted by total byte size (numel * element_size),
+    with larger tensors emitted first to reduce long-lived allocator pressure
+    in shard-based writes on very large checkpoints.
+    """
+    sorted_items = sorted(
+        state_dict.items(),
+        key=lambda kv: kv[1].numel() * kv[1].element_size(),
+        reverse=prioritize_large_tensors,
+    )
+    return OrderedDict(sorted_items)
+
+
 class DynamicOffloader:
     """
     Dynamic layer offloading for large models.
diff --git a/quantllm/core/training.py b/quantllm/core/training.py
@@ -1,5 +1,5 @@
 """
-Advanced Training Utilities for QuantLLM v2.0
+Advanced Training Utilities for QuantLLM v2.1
 
 Provides auto-configuration and optimization for fine-tuning
 with minimal user input.
@@ -154,7 +154,7 @@ def _test_batch_size(self, batch_size: int, training: bool) -> bool:
             
             if training:
                 self.model.train()
-                with torch.cuda.amp.autocast():
+                with torch.amp.autocast('cuda'):
                     outputs = self.model(dummy_input, labels=dummy_input)
                     loss = outputs.loss
                     loss.backward()
@@ -163,7 +163,7 @@ def _test_batch_size(self, batch_size: int, training: bool) -> bool:
             else:
                 self.model.eval()
                 with torch.inference_mode():
-                    with torch.cuda.amp.autocast():
+                    with torch.amp.autocast('cuda'):
                         self.model(dummy_input)
             
             del dummy_input
@@ -512,7 +512,14 @@ def load_training_data(
     
     # Tokenize
     def tokenize_fn(examples):
-        texts = examples[text_column]
+        if text_column == '__instruction_output__':
+            # Combine instruction and output fields
+            texts = [
+                f"### Instruction:\n{inst}\n\n### Response:\n{out}"
+                for inst, out in zip(examples['instruction'], examples['output'])
+            ]
+        else:
+            texts = examples[text_column]
         result = tokenizer(
             texts,
             truncation=True,
diff --git a/quantllm/core/turbo_model.py b/quantllm/core/turbo_model.py