@@ -1144,13 +1144,24 @@ def _export_gguf(
             output_path: Output file path for GGUF
             quantization: Quantization type (Q4_K_M, Q5_K_M, Q8_0, etc.)
             fast_mode: Skip intermediate F16 step for faster export (slightly less optimal)
+            chunked_conversion: Save model shards during conversion for large checkpoints
+            max_shard_size: Max shard size used when chunked conversion is active
+            smart_tensor_ordering: Save tensors in memory-optimized order
+            disk_offloading: Use a dedicated temp/offload directory for intermediate artifacts
+            disk_offload_dir: Directory used when disk_offloading=True
         """
         from ..quant import convert_to_gguf, quantize_gguf, ensure_llama_cpp_installed, GGUF_QUANT_TYPES
         from ..utils import QuantLLMProgress, format_time, format_size
         import time
 
         start_time = time.time()
 
+        chunked_conversion = bool(kwargs.pop("chunked_conversion", False))
+        max_shard_size = kwargs.pop("max_shard_size", "2GB" if chunked_conversion else "50GB")
+        smart_tensor_ordering = bool(kwargs.pop("smart_tensor_ordering", False))
+        disk_offloading = bool(kwargs.pop("disk_offloading", False))
+        disk_offload_dir = kwargs.pop("disk_offload_dir", None)
+
         quant_type = quantization or self.config.quant_type or "q4_k_m"
         quant_type_upper = quant_type.upper()
         quant_type_lower = quant_type.lower()
@@ -1163,6 +1174,12 @@ def _export_gguf(
             print_info(f"Target quantization: {quant_type_upper}")
             if fast_mode:
                 print_info("Fast mode enabled")
+            if chunked_conversion:
+                print_info(f"Chunked conversion enabled (max_shard_size={max_shard_size})")
+            if smart_tensor_ordering:
+                print_info("Smart tensor ordering enabled")
+            if disk_offloading:
+                print_info(f"Disk offloading enabled ({disk_offload_dir or 'system temp'})")
 
         # Ensure llama.cpp
         if self.verbose:
@@ -1188,21 +1205,35 @@ def _export_gguf(
         # Get model name for file naming
         model_name = self.model.config._name_or_path.split('/')[-1]
 
+        temp_parent = disk_offload_dir if disk_offloading else None
+        if temp_parent:
+            os.makedirs(temp_parent, exist_ok=True)
+
         # Create temp dir for conversion
-        with tempfile.TemporaryDirectory() as temp_dir:
+        with tempfile.TemporaryDirectory(dir=temp_parent) as temp_dir:
             # Step 1: Save model to temp directory
             if self.verbose:
                 print_header("Step 1/3: Saving Model", icon="💾")
                 print_info(f"Staging model to {temp_dir}...")
 
             with QuantLLMProgress() as progress:
                 task = progress.add_task("Saving model weights...", total=None)
+                save_kwargs = {
+                    "safe_serialization": True,
+                    "max_shard_size": max_shard_size,
+                }
+
+                if smart_tensor_ordering:
+                    from .memory import memory_optimized_tensor_order
+                    save_kwargs["state_dict"] = memory_optimized_tensor_order(model_to_save.state_dict())
+
                 try:
-                    model_to_save.save_pretrained(temp_dir, safe_serialization=True)
+                    model_to_save.save_pretrained(temp_dir, **save_kwargs)
                 except Exception as e:
                     if self.verbose:
                         print_warning(f"SafeTensors save failed ({e}), using PyTorch format...")
-                    model_to_save.save_pretrained(temp_dir, safe_serialization=False)
+                    save_kwargs["safe_serialization"] = False
+                    model_to_save.save_pretrained(temp_dir, **save_kwargs)
 
                 self.tokenizer.save_pretrained(temp_dir)
                 progress.update(task, completed=100)
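
For reference, a hedged usage sketch of the new export options. The public entry point is not shown in this diff, so the method name, model handle, and paths below are illustrative assumptions; only the keyword names (chunked_conversion, max_shard_size, smart_tensor_ordering, disk_offloading, disk_offload_dir) come from _export_gguf above.

    # Illustrative only: assumes a public export method that forwards these
    # keyword arguments through **kwargs to _export_gguf; the method name,
    # model object, and paths are hypothetical.
    model.export_gguf(
        output_path="model.Q4_K_M.gguf",
        quantization="Q4_K_M",
        chunked_conversion=True,        # stage the checkpoint in small shards
        max_shard_size="2GB",           # matches the default once chunking is on
        smart_tensor_ordering=True,     # reorder the state_dict before saving
        disk_offloading=True,           # keep intermediates on a chosen disk
        disk_offload_dir="/mnt/scratch/gguf_tmp",
    )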
0 commit comments
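The diff imports memory_optimized_tensor_order from .memory without showing its body. As a point of comparison, here is a minimal self-contained sketch of one plausible ordering policy (largest tensors first, ties broken by name, so the serializer can release the biggest buffers early while sharding); this is an assumption about the idea, not the library's actual implementation.

    # Sketch only: the real memory_optimized_tensor_order in quantllm is not
    # shown in this diff; the size-descending policy here is an assumption.
    from collections import OrderedDict

    import torch


    def memory_optimized_tensor_order_sketch(state_dict):
        """Return a new OrderedDict with the largest tensors first.

        Emitting big tensors early can lower peak memory during a sharded
        save, since their buffers are written out (and freed) sooner.
        """
        items = sorted(
            state_dict.items(),
            key=lambda kv: (-kv[1].numel() * kv[1].element_size(), kv[0]),
        )
        return OrderedDict(items)


    if __name__ == "__main__":
        sd = OrderedDict(
            [("small.bias", torch.zeros(8)), ("big.weight", torch.zeros(1024, 1024))]
        )
        # Prints ['big.weight', 'small.bias']: the large tensor is ordered first.
        print(list(memory_optimized_tensor_order_sketch(sd)))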