2323from ..utils import logger , print_header , print_success , print_error , print_info , print_warning , QuantLLMProgress
2424from transformers .utils .logging import disable_progress_bar as disable_hf_progress_bar
2525from datasets .utils .logging import disable_progress_bar as disable_ds_progress_bar
26+ from .memory import memory_optimized_tensor_order
27+
28+ DEFAULT_CHUNKED_SHARD_SIZE = "2GB"
2629
2730
2831class TurboModel :
@@ -1127,6 +1130,11 @@ def _export_gguf(
11271130 output_path : str ,
11281131 quantization : Optional [str ] = None ,
11291132 fast_mode : bool = False ,
1133+ chunked_conversion : bool = False ,
1134+ max_shard_size : Optional [str ] = None ,
1135+ smart_tensor_ordering : bool = False ,
1136+ disk_offloading : bool = False ,
1137+ disk_offload_dir : Optional [str ] = None ,
11301138 ** kwargs
11311139 ) -> str :
11321140 """
@@ -1144,13 +1152,22 @@ def _export_gguf(
11441152 output_path: Output file path for GGUF
11451153 quantization: Quantization type (Q4_K_M, Q5_K_M, Q8_0, etc.)
11461154 fast_mode: Skip intermediate F16 step for faster export (slightly less optimal)
1155+ chunked_conversion: Save model shards during conversion for large checkpoints
1156+ max_shard_size: Max shard size used when chunked conversion is active
1157+ smart_tensor_ordering: Save tensors in memory-optimized order
1158+ disk_offloading: Use a dedicated temp/offload directory for intermediate artifacts
1159+ disk_offload_dir: Directory used when disk_offloading=True
11471160 """
11481161 from ..quant import convert_to_gguf , quantize_gguf , ensure_llama_cpp_installed , GGUF_QUANT_TYPES
11491162 from ..utils import QuantLLMProgress , format_time , format_size
11501163 import time
11511164
11521165 start_time = time .time ()
11531166
1167+ effective_shard_size = max_shard_size or (
1168+ DEFAULT_CHUNKED_SHARD_SIZE if chunked_conversion else None
1169+ )
1170+
11541171 quant_type = quantization or self .config .quant_type or "q4_k_m"
11551172 quant_type_upper = quant_type .upper ()
11561173 quant_type_lower = quant_type .lower ()
@@ -1163,6 +1180,13 @@ def _export_gguf(
11631180 print_info (f"Target quantization: { quant_type_upper } " )
11641181 if fast_mode :
11651182 print_info ("Fast mode enabled" )
1183+ if chunked_conversion :
1184+ print_info (f"Chunked conversion enabled (max_shard_size={ effective_shard_size } )" )
1185+ if smart_tensor_ordering :
1186+ print_info ("Smart tensor ordering enabled" )
1187+ print_warning ("Smart tensor ordering may temporarily materialize a full state dict in memory." )
1188+ if disk_offloading :
1189+ print_info (f"Disk offloading enabled ({ disk_offload_dir or 'system temp' } )" )
11661190
11671191 # Ensure llama.cpp
11681192 if self .verbose :
@@ -1188,21 +1212,35 @@ def _export_gguf(
11881212 # Get model name for file naming
11891213 model_name = self .model .config ._name_or_path .split ('/' )[- 1 ]
11901214
1215+ temp_parent = disk_offload_dir if disk_offloading else None
1216+ if temp_parent :
1217+ os .makedirs (temp_parent , exist_ok = True )
1218+
11911219 # Create temp dir for conversion
1192- with tempfile .TemporaryDirectory () as temp_dir :
1220+ with tempfile .TemporaryDirectory (dir = temp_parent ) as temp_dir :
11931221 # Step 1: Save model to temp directory
11941222 if self .verbose :
11951223 print_header ("Step 1/3: Saving Model" , icon = "💾" )
11961224 print_info (f"Staging model to { temp_dir } ..." )
11971225
11981226 with QuantLLMProgress () as progress :
11991227 task = progress .add_task ("Saving model weights..." , total = None )
1228+ save_kwargs = {
1229+ "safe_serialization" : True ,
1230+ }
1231+ if effective_shard_size :
1232+ save_kwargs ["max_shard_size" ] = effective_shard_size
1233+
1234+ if smart_tensor_ordering :
1235+ save_kwargs ["state_dict" ] = memory_optimized_tensor_order (model_to_save .state_dict ())
1236+
12001237 try :
1201- model_to_save .save_pretrained (temp_dir , safe_serialization = True )
1238+ model_to_save .save_pretrained (temp_dir , ** save_kwargs )
12021239 except Exception as e :
12031240 if self .verbose :
12041241 print_warning (f"SafeTensors save failed ({ e } ), using PyTorch format..." )
1205- model_to_save .save_pretrained (temp_dir , safe_serialization = False )
1242+ save_kwargs ["safe_serialization" ] = False
1243+ model_to_save .save_pretrained (temp_dir , ** save_kwargs )
12061244
12071245 self .tokenizer .save_pretrained (temp_dir )
12081246 progress .update (task , completed = 100 )