
Commit 4828488

Merge pull request #25 from codewithdark-git/copilot/centralize-config-export-push-parameters
Centralize Turbo export/push config, make quantization defaults deterministic, and prepare v2.1 pre-release
2 parents da272cc + 1c63330 commit 4828488

26 files changed · 350 additions & 116 deletions
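
The change threads one `config` dict from `turbo()` through `export()` and `push()`, so format and quantization are stated once instead of being repeated at every call site. A minimal sketch of the fallback pattern, assuming explicit call arguments take precedence over the shared config (the helper below is hypothetical, not the code from this PR):

```python
from typing import Any, Dict, Optional

def resolve_export_settings(
    config: Optional[Dict[str, Any]],
    format: Optional[str] = None,
    quantization: Optional[str] = None,
) -> Dict[str, Optional[str]]:
    """Explicit arguments win; otherwise fall back to the shared config."""
    config = config or {}
    return {
        "format": format or config.get("format"),
        "quantization": quantization or config.get("quantization"),
    }

# export() with no arguments would pick up the config defaults;
# push() would consult push_format / push_quantization the same way.
settings = resolve_export_settings({"format": "gguf", "quantization": "Q4_K_M"})
assert settings == {"format": "gguf", "quantization": "Q4_K_M"}
```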

README.md

Lines changed: 23 additions & 13 deletions
@@ -1,7 +1,7 @@
 <div align="center">
 <img src="docs/images/1.png" alt="QuantLLM Logo" />

-# 🚀 QuantLLM v2.0
+# 🚀 QuantLLM v2.1 (pre-release)

 **The Ultra-Fast LLM Quantization & Export Library**

@@ -52,9 +52,12 @@ model = AutoModelForCausalLM.from_pretrained(
 ```python
 from quantllm import turbo

-model = turbo("meta-llama/Llama-3-8B")  # Auto-quantizes
+model = turbo(
+    "meta-llama/Llama-3-8B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)  # Auto-quantizes
 model.generate("Hello!")  # Generate text
-model.export("gguf", quantization="Q4_K_M")  # Export to GGUF
+model.export()  # Export to GGUF with shared config
 ```

 ---
@@ -77,14 +80,17 @@ pip install "quantllm[full] @ git+https://github.com/codewithdark-git/QuantLLM.g
 from quantllm import turbo

 # Load with automatic optimization
-model = turbo("meta-llama/Llama-3.2-3B")
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)

 # Generate text
 response = model.generate("Explain quantum computing simply")
 print(response)

 # Export to GGUF
-model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+model.export("gguf", "model.Q4_K_M.gguf")
 ```

 **QuantLLM automatically:**
@@ -102,11 +108,14 @@ model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
 One unified interface for everything:

 ```python
-model = turbo("mistralai/Mistral-7B")
+model = turbo(
+    "mistralai/Mistral-7B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
 model.generate("Hello!")
 model.finetune(data, epochs=3)
-model.export("gguf", quantization="Q4_K_M")
-model.push("user/repo", format="gguf")
+model.export()
+model.push("user/repo")
 ```

 ### ⚡ Performance Optimizations
@@ -133,7 +142,7 @@ Llama 2/3, Mistral, Mixtral, Qwen 1/2, Phi 1/2/3, Gemma, Falcon, DeepSeek, Yi, S

 ```
 ╔════════════════════════════════════════════════════════════╗
-║ 🚀 QuantLLM v2.0.0
+║ 🚀 QuantLLM v2.1.0rc1
 ║ Ultra-fast LLM Quantization & Export ║
 ║ ✓ GGUF ✓ ONNX ✓ MLX ✓ SafeTensors ║
 ╚════════════════════════════════════════════════════════════╝
@@ -148,7 +157,7 @@ Llama 2/3, Mistral, Mixtral, Qwen 1/2, Phi 1/2/3, Gemma, Falcon, DeepSeek, Yi, S
 Auto-generates model cards with YAML frontmatter, usage examples, and "Use this model" button:

 ```python
-model.push("user/my-model", format="gguf", quantization="Q4_K_M")
+model.push("user/my-model")
 ```

 ---
@@ -195,7 +204,10 @@ model.export("safetensors", "./model-hf/")
 ```python
 from quantllm import turbo

-model = turbo("meta-llama/Llama-3.2-3B")
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)

 # Simple generation
 response = model.generate(
@@ -267,8 +279,6 @@ model = turbo("meta-llama/Llama-3.2-3B")
 # Push with auto-generated model card
 model.push(
     "your-username/my-model",
-    format="gguf",
-    quantization="Q4_K_M",
     license="apache-2.0"
 )
 ```
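
Note that the new signatures keep `format`, `output_path`, and `quantization` as optional parameters, so a one-off export can presumably still override the shared config at the call site (the Q8_0 value below is illustrative, not from this diff):

```python
# Explicit arguments override the config defaults set in turbo(...)
model.export("gguf", "model.Q8_0.gguf", quantization="Q8_0")
```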

docs/api/gguf.md

Lines changed: 5 additions & 2 deletions
@@ -10,8 +10,11 @@ Export models to GGUF format for llama.cpp, Ollama, and LM Studio.
 from quantllm import turbo, convert_to_gguf, quantize_gguf

 # Method 1: Via TurboModel
-model = turbo("meta-llama/Llama-3.2-3B")
-model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
+model.export("gguf", "model.Q4_K_M.gguf")

 # Method 2: Direct conversion
 convert_to_gguf("meta-llama/Llama-3.2-3B", "model.Q4_K_M.gguf", quant_type="Q4_K_M")

docs/api/hub.md

Lines changed: 12 additions & 8 deletions
@@ -10,8 +10,11 @@ Push models to HuggingFace Hub with auto-generated model cards.
 from quantllm import turbo, QuantLLMHubManager

 # Method 1: TurboModel.push() (Recommended)
-model = turbo("meta-llama/Llama-3.2-3B")
-model.push("user/my-model", format="gguf", quantization="Q4_K_M")
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
+model.push("user/my-model")

 # Method 2: QuantLLMHubManager (Advanced)
 manager = QuantLLMHubManager("user/my-model", hf_token="hf_...")
@@ -30,7 +33,7 @@ def push(
     self,
     repo_id: str,
     token: Optional[str] = None,
-    format: str = "safetensors",
+    format: Optional[str] = None,
     quantization: Optional[str] = None,
     license: str = "apache-2.0",
     commit_message: str = "Upload model via QuantLLM",
@@ -44,7 +47,7 @@ def push(
 |-----------|------|---------|-------------|
 | `repo_id` | str | required | HuggingFace repo ID (user/model) |
 | `token` | str | None | HF token (or use HF_TOKEN env) |
-| `format` | str | "safetensors" | Export format |
+| `format` | str | None | Export format (uses `config["push_format"]` when omitted) |
 | `quantization` | str | None | Quantization type |
 | `license` | str | "apache-2.0" | License type |

@@ -62,13 +65,14 @@ def push(
 ```python
 from quantllm import turbo

-model = turbo("meta-llama/Llama-3.2-3B")
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)

 # Push as GGUF
 model.push(
-    "your-username/llama-3.2-3b-gguf",
-    format="gguf",
-    quantization="Q4_K_M"
+    "your-username/llama-3.2-3b-gguf"
 )

 # Push as ONNX
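
The parameter table above says the token may also come from the `HF_TOKEN` environment variable, so a push with shared config can omit both `token` and `format`. A sketch, assuming the env var is read as documented:

```python
import os

os.environ["HF_TOKEN"] = "hf_..."  # placeholder; use a real token

# format and quantization fall back to config["push_format"] etc.
model.push("your-username/llama-3.2-3b-gguf")
```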

docs/api/turbo.md

Lines changed: 3 additions & 1 deletion
@@ -14,6 +14,7 @@ def turbo(
     max_length: Optional[int] = None,
     device: Optional[str] = None,
     dtype: Optional[str] = None,
+    config: Optional[Dict[str, Any]] = None,
     quantize: bool = True,
     trust_remote_code: bool = False,
     verbose: bool = True,
@@ -32,6 +33,7 @@ def turbo(
 | `max_length` | int | auto | Maximum context length |
 | `device` | str | auto | Device ("cuda", "cpu", "cuda:0", "auto") |
 | `dtype` | str | auto | Data type ("float16", "bfloat16") |
+| `config` | dict | None | Shared export/push defaults (`format`, `quantization`, `push_format`, `push_quantization`) |
 | `quantize` | bool | True | Whether to apply quantization |
 | `trust_remote_code` | bool | False | Trust remote code in model |
 | `verbose` | bool | True | Show loading progress and stats |
@@ -124,7 +126,7 @@ When `verbose=True` (default), you'll see:

 ```
 ╔════════════════════════════════════════════════════════════╗
-║ 🚀 QuantLLM v2.0.0
+║ 🚀 QuantLLM v2.1.0rc1
 ╚════════════════════════════════════════════════════════════╝

 📊 Loading: meta-llama/Llama-3.2-3B
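
The new `config` row above lists four keys. A sketch passing all of them, assuming `push_quantization` plays the same role for `push()` that `quantization` plays for `export()`:

```python
from quantllm import turbo

model = turbo(
    "meta-llama/Llama-3.2-3B",
    config={
        "format": "gguf",               # default for model.export()
        "quantization": "Q4_K_M",       # default quant for export
        "push_format": "gguf",          # default for model.push()
        "push_quantization": "Q4_K_M",  # assumed default quant for push
    },
)
```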

docs/api/turbomodel.md

Lines changed: 11 additions & 9 deletions
@@ -232,23 +232,27 @@ Export the model to various formats.
 ```python
 def export(
     self,
-    format: str,
-    output_path: str,
+    format: Optional[str] = None,
+    output_path: Optional[str] = None,
     quantization: Optional[str] = None,
     **kwargs
 ) -> str
 ```

 | Parameter | Type | Description |
 |-----------|------|-------------|
-| `format` | str | "gguf", "onnx", "mlx", "safetensors" |
-| `output_path` | str | Output file or directory |
+| `format` | str | "gguf", "onnx", "mlx", "safetensors" (optional, uses shared config) |
+| `output_path` | str | Output file or directory (optional) |
 | `quantization` | str | Quantization type (format-specific) |

 **Examples:**
 ```python
 # GGUF
-model.export("gguf", "model.Q4_K_M.gguf", quantization="Q4_K_M")
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
+model.export()

 # ONNX
 model.export("onnx", "./model-onnx/")
@@ -269,7 +273,7 @@ def push(
     self,
     repo_id: str,
     token: Optional[str] = None,
-    format: str = "safetensors",
+    format: Optional[str] = None,
     quantization: Optional[str] = None,
     license: str = "apache-2.0",
     commit_message: str = "Upload model via QuantLLM",
@@ -281,9 +285,7 @@ def push(
 ```python
 # Push as GGUF
 model.push(
-    "your-username/my-model",
-    format="gguf",
-    quantization="Q4_K_M"
+    "your-username/my-model"
 )

 # Push as MLX
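
Since `export()` is annotated `-> str`, the configured no-argument call still returns the output path; a small usage sketch:

```python
# export() returns the output path per the signature above
path = model.export()  # format/quantization come from the shared config
print(f"Exported to: {path}")
```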

docs/conf.py

Lines changed: 2 additions & 2 deletions
@@ -3,7 +3,7 @@
 project = 'QuantLLM'
 copyright = '2024, Dark Coder'
 author = 'Dark Coder'
-release = '2.0.0'
+release = '2.1.0rc1'

 # Extensions
 extensions = [
@@ -21,7 +21,7 @@
 # HTML output
 html_theme = 'sphinx_rtd_theme'
 html_static_path = ['_static']
-html_title = 'QuantLLM v2.0'
+html_title = 'QuantLLM v2.1'
 html_logo = 'images/logo.png'
 html_favicon = 'images/favicon.ico'

docs/guide/finetuning.md

Lines changed: 2 additions & 2 deletions
@@ -193,13 +193,13 @@ print("Fine-tuned:", model.generate("prompt"))

 ```python
 # Export to GGUF
-model.export("gguf", "finetuned.Q4_K_M.gguf", quantization="Q4_K_M")
+model.export("gguf", "finetuned.Q4_K_M.gguf")

 # Export to SafeTensors
 model.export("safetensors", "./finetuned-model/")

 # Push to HuggingFace
-model.push("your-username/finetuned-model", format="gguf")
+model.push("your-username/finetuned-model")
 ```

 ### Save and Load

docs/guide/gguf-export.md

Lines changed: 4 additions & 2 deletions
@@ -130,10 +130,12 @@ print(output["choices"][0]["text"])
 Export and push in one step:

 ```python
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)
 model.push(
     "your-username/my-model-gguf",
-    format="gguf",
-    quantization="Q4_K_M",
     license="apache-2.0"
 )
 ```

docs/guide/hub-integration.md

Lines changed: 15 additions & 30 deletions
@@ -11,14 +11,15 @@ The easiest way to share your model:
 ```python
 from quantllm import turbo

-model = turbo("meta-llama/Llama-3.2-3B")
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={"format": "gguf", "quantization": "Q4_K_M", "push_format": "gguf"},
+)

 # Push with auto-generated model card
 model.push(
     "your-username/my-model",
-    token="hf_...",
-    format="gguf",
-    quantization="Q4_K_M"
+    token="hf_..."
 )
 ```

@@ -49,34 +50,18 @@ model.push("user/repo", token="hf_...")
 ```python
 from quantllm import turbo

-model = turbo("meta-llama/Llama-3.2-3B")
-
-# Push as GGUF (for Ollama, llama.cpp, LM Studio)
-model.push(
-    "your-username/my-model-gguf",
-    format="gguf",
-    quantization="Q4_K_M",
-    license="apache-2.0"
+model = turbo(
+    "meta-llama/Llama-3.2-3B",
+    config={
+        "format": "gguf",
+        "quantization": "Q4_K_M",
+        "push_format": "gguf",
+    },
 )

-# Push as ONNX
-model.push(
-    "your-username/my-model-onnx",
-    format="onnx"
-)
-
-# Push as MLX (Apple Silicon)
-model.push(
-    "your-username/my-model-mlx",
-    format="mlx",
-    quantization="4bit"
-)
-
-# Push as SafeTensors (default)
-model.push(
-    "your-username/my-model",
-    format="safetensors"
-)
+# Uses shared config defaults
+model.export()
+model.push("your-username/my-model-gguf", license="apache-2.0")
 ```

 ### Method 2: QuantLLMHubManager (Advanced)
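
The removed per-format examples (ONNX, MLX, SafeTensors) remain reachable by overriding the config per call, since `format` and `quantization` stay accepted parameters of `push()`; a hedged sketch reusing the values from the old examples:

```python
# One-off pushes in another format, overriding config["push_format"]
model.push("your-username/my-model-onnx", format="onnx")
model.push("your-username/my-model-mlx", format="mlx", quantization="4bit")
```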
