2323from ..utils import logger , print_header , print_success , print_error , print_info , print_warning , QuantLLMProgress
2424from transformers .utils .logging import disable_progress_bar as disable_hf_progress_bar
2525from datasets .utils .logging import disable_progress_bar as disable_ds_progress_bar
26+ from .memory import memory_optimized_tensor_order
27+
28+ DEFAULT_CHUNKED_SHARD_SIZE = "2GB"
2629
2730
2831class TurboModel :
@@ -1127,6 +1130,11 @@ def _export_gguf(
11271130 output_path : str ,
11281131 quantization : Optional [str ] = None ,
11291132 fast_mode : bool = False ,
1133+ chunked_conversion : bool = False ,
1134+ max_shard_size : Optional [str ] = None ,
1135+ smart_tensor_ordering : bool = False ,
1136+ disk_offloading : bool = False ,
1137+ disk_offload_dir : Optional [str ] = None ,
11301138 ** kwargs
11311139 ) -> str :
11321140 """
@@ -1156,11 +1164,9 @@ def _export_gguf(
11561164
11571165 start_time = time .time ()
11581166
1159- chunked_conversion = bool (kwargs .pop ("chunked_conversion" , False ))
1160- max_shard_size = kwargs .pop ("max_shard_size" , "2GB" if chunked_conversion else "50GB" )
1161- smart_tensor_ordering = bool (kwargs .pop ("smart_tensor_ordering" , False ))
1162- disk_offloading = bool (kwargs .pop ("disk_offloading" , False ))
1163- disk_offload_dir = kwargs .pop ("disk_offload_dir" , None )
1167+ effective_shard_size = max_shard_size or (
1168+ DEFAULT_CHUNKED_SHARD_SIZE if chunked_conversion else None
1169+ )
11641170
11651171 quant_type = quantization or self .config .quant_type or "q4_k_m"
11661172 quant_type_upper = quant_type .upper ()
@@ -1175,9 +1181,10 @@ def _export_gguf(
11751181 if fast_mode :
11761182 print_info ("Fast mode enabled" )
11771183 if chunked_conversion :
1178- print_info (f"Chunked conversion enabled (max_shard_size={ max_shard_size } )" )
1184+ print_info (f"Chunked conversion enabled (max_shard_size={ effective_shard_size } )" )
11791185 if smart_tensor_ordering :
11801186 print_info ("Smart tensor ordering enabled" )
1187+ print_warning ("Smart tensor ordering may temporarily materialize a full state dict in memory." )
11811188 if disk_offloading :
11821189 print_info (f"Disk offloading enabled ({ disk_offload_dir or 'system temp' } )" )
11831190
@@ -1220,11 +1227,11 @@ def _export_gguf(
12201227 task = progress .add_task ("Saving model weights..." , total = None )
12211228 save_kwargs = {
12221229 "safe_serialization" : True ,
1223- "max_shard_size" : max_shard_size ,
12241230 }
1231+ if effective_shard_size :
1232+ save_kwargs ["max_shard_size" ] = effective_shard_size
12251233
12261234 if smart_tensor_ordering :
1227- from .memory import memory_optimized_tensor_order
12281235 save_kwargs ["state_dict" ] = memory_optimized_tensor_order (model_to_save .state_dict ())
12291236
12301237 try :
0 commit comments