Skip to content

Commit 2cd4bd2

Browse files
feat: centralize export and push config defaults in turbo
Agent-Logs-Url: https://github.com/codewithdark-git/QuantLLM/sessions/aa78d528-be1d-4467-813d-711a55ade22a
Co-authored-by: codewithdark-git <144595403+codewithdark-git@users.noreply.github.com>
1 parent 5bdaf11 commit 2cd4bd2

5 files changed

Lines changed: 183 additions & 43 deletions

File tree

README.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,11 +102,14 @@ model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
102102
One unified interface for everything:
103103

104104
```python
105-
model = turbo("mistralai/Mistral-7B")
105+
model = turbo(
106+
"mistralai/Mistral-7B",
107+
config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
108+
)
106109
model.generate("Hello!")
107110
model.finetune(data, epochs=3)
108-
model.export("gguf", quantization="Q4_K_M")
109-
model.push("user/repo", format="gguf")
111+
model.export()
112+
model.push("user/repo")
110113
```
111114

112115
### ⚡ Performance Optimizations

docs/api/turbo.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ def turbo(
1414
max_length: Optional[int] = None,
1515
device: Optional[str] = None,
1616
dtype: Optional[str] = None,
17+
config: Optional[Dict[str, Any]] = None,
1718
quantize: bool = True,
1819
trust_remote_code: bool = False,
1920
verbose: bool = True,
@@ -32,6 +33,7 @@ def turbo(
3233
| `max_length` | int | auto | Maximum context length |
3334
| `device` | str | auto | Device ("cuda", "cpu", "cuda:0", "auto") |
3435
| `dtype` | str | auto | Data type ("float16", "bfloat16") |
36+
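| `config` | dict | None | Shared export/push defaults (`format`, `quantization`, `push_format`, `push_quantization`). When `format` or `quantization` is given and the matching `push_*` key is omitted, the `push_*` value inherits it |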
3537
| `quantize` | bool | True | Whether to apply quantization |
3638
| `trust_remote_code` | bool | False | Trust remote code in model |
3739
| `verbose` | bool | True | Show loading progress and stats |

docs/guide/hub-integration.md

Lines changed: 10 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -49,34 +49,18 @@ model.push("user/repo", token="hf_...")
4949
```python
5050
from quantllm import turbo
5151

52-
model = turbo("meta-llama/Llama-3.2-3B")
53-
54-
# Push as GGUF (for Ollama, llama.cpp, LM Studio)
55-
model.push(
56-
"your-username/my-model-gguf",
57-
format="gguf",
58-
quantization="Q4_K_M",
59-
license="apache-2.0"
60-
)
61-
62-
# Push as ONNX
63-
model.push(
64-
"your-username/my-model-onnx",
65-
format="onnx"
52+
model = turbo(
53+
"meta-llama/Llama-3.2-3B",
54+
config={
55+
"format": "gguf",
56+
"quantization": "Q4_K_M",
57+
"push_format": "gguf",
58+
},
6659
)
6760

68-
# Push as MLX (Apple Silicon)
69-
model.push(
70-
"your-username/my-model-mlx",
71-
format="mlx",
72-
quantization="4bit"
73-
)
74-
75-
# Push as SafeTensors (default)
76-
model.push(
77-
"your-username/my-model",
78-
format="safetensors"
79-
)
61+
# Uses shared config defaults
62+
model.export()
63+
model.push("your-username/my-model-gguf", license="apache-2.0")
8064
```
8165

8266
### Method 2: QuantLLMHubManager (Advanced)

quantllm/core/turbo_model.py

Lines changed: 52 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,12 @@
2626
from .memory import memory_optimized_tensor_order
2727

2828
DEFAULT_CHUNKED_SHARD_SIZE = "2GB"
29+
DEFAULT_EXPORT_PUSH_CONFIG = {
30+
"format": "safetensors",
31+
"push_format": "safetensors",
32+
"quantization": "Q4_K_M",
33+
"push_quantization": "Q4_K_M",
34+
}
2935

3036

3137
class TurboModel:
@@ -57,6 +63,7 @@ def __init__(
5763
model: PreTrainedModel,
5864
tokenizer: PreTrainedTokenizer,
5965
config: SmartConfig,
66+
export_push_config: Optional[Dict[str, Any]] = None,
6067
verbose: bool = False,
6168
):
6269
"""
@@ -76,6 +83,7 @@ def __init__(
7683
self._is_quantized = False
7784
self._is_finetuned = False
7885
self._lora_applied = False
86+
self.export_push_config = self._build_export_push_config(export_push_config)
7987
self.verbose = verbose
8088

8189
@classmethod
@@ -92,6 +100,7 @@ def from_pretrained(
92100
trust_remote_code: bool = True,
93101
quantize: bool = True,
94102
config_override: Optional[Dict[str, Any]] = None,
103+
config: Optional[Dict[str, Any]] = None,
95104
verbose: bool = True,
96105
) -> "TurboModel":
97106
"""
@@ -112,6 +121,7 @@ def from_pretrained(
112121
trust_remote_code: Trust remote code in model
113122
quantize: Whether to quantize the model
114123
config_override: Dict to override any auto-detected settings
124+
config: Shared export/push config (format, quantization, push_format, etc.)
115125
quantize: Whether to quantize the model
116126
config_override: Dict to override any auto-detected settings
117127
verbose: Print loading progress
@@ -268,7 +278,7 @@ def from_pretrained(
268278
print_success("Model loaded successfully!")
269279
logger.info("")
270280

271-
instance = cls(model, tokenizer, smart_config)
281+
instance = cls(model, tokenizer, smart_config, export_push_config=config)
272282
instance._is_quantized = quantize and smart_config.bits < 16
273283

274284
return instance
@@ -494,6 +504,27 @@ def _get_quantization_kwargs(config: SmartConfig) -> Dict[str, Any]:
494504
except ImportError:
495505
logger.warning("⚠ bitsandbytes not installed, loading without quantization")
496506
return {}
507+
508+
@staticmethod
def _build_export_push_config(config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
    """Build the shared export/push config on top of the module defaults.

    Starts from a copy of ``DEFAULT_EXPORT_PUSH_CONFIG`` and overlays the
    caller's values. ``export_format`` / ``export_quantization`` are accepted
    as aliases for ``format`` / ``quantization``. Keys that are not part of
    the defaults are ignored, and ``None`` values never override a default.

    When an export key (``format`` / ``quantization``, or its alias) is
    supplied but the matching push key (``push_format`` /
    ``push_quantization``) is not, the push key inherits the resolved export
    value so a single setting drives both ``export()`` and ``push_to_hub()``.

    Args:
        config: User-supplied overrides, or ``None`` / empty for pure defaults.

    Returns:
        A new dict with the keys of ``DEFAULT_EXPORT_PUSH_CONFIG``.
    """
    aliases = {
        "export_format": "format",
        "export_quantization": "quantization",
    }
    # push key -> export key it falls back to when not given explicitly
    inherits = {
        "push_format": "format",
        "push_quantization": "quantization",
    }

    resolved = dict(DEFAULT_EXPORT_PUSH_CONFIG)
    if not config:
        return resolved

    # Track canonical key names so aliases take part in the inheritance
    # decision below; checking the raw keys would let `export_format` set
    # `format` without propagating to `push_format`.
    supplied = set()
    for key, value in config.items():
        canonical = aliases.get(key, key)
        supplied.add(canonical)
        if canonical in resolved and value is not None:
            resolved[canonical] = value

    for push_key, export_key in inherits.items():
        if export_key in supplied and push_key not in supplied:
            resolved[push_key] = resolved[export_key]

    return resolved
497528

498529
@staticmethod
499530
def _enable_flash_attention(model: PreTrainedModel, verbose: bool = True) -> None:
@@ -945,7 +976,7 @@ def tokenize_function(examples):
945976

946977
def export(
947978
self,
948-
format: str,
979+
format: Optional[str] = None,
949980
output_path: Optional[str] = None,
950981
*,
951982
quantization: Optional[str] = None,
@@ -961,7 +992,7 @@ def export(
961992
- "mlx": For Apple Silicon Macs
962993
963994
Args:
964-
format: Target format (gguf, safetensors, onnx, mlx)
995+
format: Target format (gguf, safetensors, onnx, mlx). Uses shared config when omitted.
965996
output_path: Output file/directory path
966997
quantization: Format-specific quantization:
967998
- GGUF: Q4_K_M, Q5_K_M, Q8_0, etc.
@@ -978,7 +1009,10 @@ def export(
9781009
>>> model.export("onnx", "./my_model_onnx/")
9791010
>>> model.export("mlx", "./my_model_mlx/", quantization="4bit")
9801011
"""
981-
format = format.lower()
1012+
format = (format or self.export_push_config["format"]).lower()
1013+
effective_quantization = quantization
1014+
if effective_quantization is None and format == "gguf":
1015+
effective_quantization = self.export_push_config["quantization"]
9821016

9831017
# Merge LoRA if applied
9841018
if self._lora_applied:
@@ -991,7 +1025,7 @@ def export(
9911025
if output_path is None:
9921026
model_name = self.model.config._name_or_path.split('/')[-1]
9931027
if format == "gguf":
994-
quant = quantization or self.config.quant_type or "q4_k_m"
1028+
quant = effective_quantization or "Q4_K_M"
9951029
output_path = f"{model_name}.{quant.upper()}.gguf"
9961030
elif format == "safetensors":
9971031
output_path = f"./{model_name}-quantllm/"
@@ -1012,7 +1046,7 @@ def export(
10121046
raise ValueError(f"Unknown format: {format}. Supported: {list(exporters.keys())}")
10131047

10141048
print_header(f"Exporting to {format.upper()}")
1015-
result = exporters[format](output_path, quantization=quantization, **kwargs)
1049+
result = exporters[format](output_path, quantization=effective_quantization, **kwargs)
10161050
print_success(f"Exported to: {result}")
10171051

10181052
return result
@@ -1021,7 +1055,7 @@ def push_to_hub(
10211055
self,
10221056
repo_id: str,
10231057
token: Optional[str] = None,
1024-
format: str = "safetensors",
1058+
format: Optional[str] = None,
10251059
quantization: Optional[str] = None,
10261060
commit_message: str = "Upload model via QuantLLM",
10271061
license: str = "apache-2.0",
@@ -1052,7 +1086,8 @@ def push_to_hub(
10521086
"""
10531087
from ..hub import QuantLLMHubManager
10541088

1055-
format_lower = format.lower()
1089+
format_lower = (format or self.export_push_config["push_format"]).lower()
1090+
push_quantization = quantization or self.export_push_config["push_quantization"]
10561091

10571092
# Get the original base model name (full path for HuggingFace link)
10581093
base_model_full = self.model.config._name_or_path
@@ -1066,7 +1101,7 @@ def push_to_hub(
10661101

10671102
if format_lower == "gguf":
10681103
# Export GGUF directly to staging
1069-
quant_label = quantization or (self.config.quant_type if self.config.quant_type != "GGUF" else "q4_k_m") or "q4_k_m"
1104+
quant_label = push_quantization or "Q4_K_M"
10701105
filename = f"{model_name}.{quant_label.upper()}.gguf"
10711106
save_path = os.path.join(manager.staging_dir, filename)
10721107

@@ -1085,11 +1120,11 @@ def push_to_hub(
10851120
print_info("Exporting to ONNX format...")
10861121
save_path = manager.staging_dir
10871122

1088-
self._export_onnx(save_path, quantization=quantization, **kwargs)
1123+
self._export_onnx(save_path, quantization=push_quantization, **kwargs)
10891124

10901125
manager.track_hyperparameters({
10911126
"format": "onnx",
1092-
"quantization": quantization,
1127+
"quantization": push_quantization,
10931128
"base_model": base_model_full,
10941129
"license": license,
10951130
})
@@ -1100,11 +1135,11 @@ def push_to_hub(
11001135
print_info("Exporting to MLX format...")
11011136
save_path = manager.staging_dir
11021137

1103-
self._export_mlx(save_path, quantization=quantization, **kwargs)
1138+
self._export_mlx(save_path, quantization=push_quantization, **kwargs)
11041139

11051140
manager.track_hyperparameters({
11061141
"format": "mlx",
1107-
"quantization": quantization,
1142+
"quantization": push_quantization,
11081143
"base_model": base_model_full,
11091144
"license": license,
11101145
})
@@ -1117,7 +1152,7 @@ def push_to_hub(
11171152
"base_model": base_model_full,
11181153
"license": license,
11191154
})
1120-
manager.save_final_model(self, format=format)
1155+
manager.save_final_model(self, format=format_lower)
11211156
manager._generate_model_card(format=format_lower)
11221157

11231158
manager.push(commit_message=commit_message)
@@ -1852,6 +1887,7 @@ def turbo(
18521887
max_length: Optional[int] = None,
18531888
device: Optional[str] = None,
18541889
dtype: Optional[str] = None,
1890+
config: Optional[Dict[str, Any]] = None,
18551891
**kwargs,
18561892
) -> TurboModel:
18571893
"""
@@ -1866,6 +1902,7 @@ def turbo(
18661902
max_length: Override max sequence length (default: auto)
18671903
device: Override device (default: best GPU)
18681904
dtype: Override dtype (default: bf16/fp16)
1905+
config: Shared export/push config (format, quantization, push_format, etc.)
18691906
**kwargs: Additional options passed to from_pretrained
18701907
18711908
Returns:
@@ -1896,5 +1933,6 @@ def turbo(
18961933
max_length=max_length,
18971934
device=device,
18981935
dtype=dtype,
1936+
config=config,
18991937
**kwargs,
19001938
)

0 commit comments

Comments
 (0)