|
1 | 1 | """ |
2 | | -QuantLLM v2.0 - GGUF Export Example |
| 2 | +QuantLLM v2.1 - GGUF Export Example |
3 | 3 |
|
4 | 4 | Export models to GGUF format for use with llama.cpp, Ollama, and LM Studio. |
5 | 5 | No external dependencies required! |
6 | 6 | """ |
7 | 7 |
|
8 | | -from quantllm import turbo, list_quant_types |
| 8 | +from quantllm import turbo, GGUF_QUANT_TYPES, QUANT_RECOMMENDATIONS |
9 | 9 |
|
10 | 10 | # ============================================ |
11 | 11 | # Show Available Quantization Types |
12 | 12 | # ============================================ |
13 | 13 | print("📦 Available quantization types:\n") |
14 | | -for name, desc in list_quant_types().items(): |
15 | | - print(f" {name:12} - {desc}") |
| 14 | +for qt in GGUF_QUANT_TYPES: |
| 15 | + print(f" {qt}") |
| 16 | + |
| 17 | +print("\n📦 Recommended quantization types:\n") |
| 18 | +for use_case, qt in QUANT_RECOMMENDATIONS.items(): |
| 19 | + print(f" {use_case:12} → {qt}") |
16 | 20 |
|
17 | 21 | # ============================================ |
18 | 22 | # Load Model |
|
24 | 28 | # Export to GGUF |
25 | 29 | # ============================================ |
26 | 30 |
|
27 | | -# Option 1: Quick export (default q4_0) |
28 | | -print("\n🚀 Exporting to GGUF (q4_0)...") |
29 | | -model.export("gguf", "tinyllama-q4.gguf") |
30 | | - |
31 | | -# Option 2: High quality (q8_0) |
32 | | -print("\n🚀 Exporting to GGUF (q8_0)...") |
33 | | -model.export("gguf", "tinyllama-q8.gguf", quantization="q8_0") |
34 | | - |
35 | | -# Option 3: Half precision (f16) |
36 | | -print("\n🚀 Exporting to GGUF (f16)...") |
37 | | -model.export("gguf", "tinyllama-f16.gguf", quantization="f16") |
38 | | - |
39 | | -# ============================================ |
40 | | -# Using convert_to_gguf Directly |
41 | | -# ============================================ |
42 | | -from quantllm import convert_to_gguf |
43 | | -from transformers import AutoModelForCausalLM, AutoTokenizer |
44 | | - |
45 | | -print("\n🔧 Using convert_to_gguf directly...") |
| 31 | +# Option 1: Quick export (default Q4_K_M) |
| 32 | +print("\n🚀 Exporting to GGUF (Q4_K_M)...") |
| 33 | +model.export("gguf", "tinyllama-q4.gguf", quantization="Q4_K_M") |
46 | 34 |
|
47 | | -# Load with transformers |
48 | | -hf_model = AutoModelForCausalLM.from_pretrained( |
49 | | - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", |
50 | | - torch_dtype="auto" |
51 | | -) |
52 | | -tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0") |
| 35 | +# Option 2: High quality (Q8_0) |
| 36 | +print("\n🚀 Exporting to GGUF (Q8_0)...") |
| 37 | +model.export("gguf", "tinyllama-q8.gguf", quantization="Q8_0") |
53 | 38 |
|
54 | | -# Convert |
55 | | -convert_to_gguf( |
56 | | - model=hf_model, |
57 | | - tokenizer=tokenizer, |
58 | | - output_path="tinyllama-direct.gguf", |
59 | | - quant_type="q4_0", |
60 | | - verbose=True |
61 | | -) |
| 39 | +# Option 3: Half precision (F16) |
| 40 | +print("\n🚀 Exporting to GGUF (F16)...") |
| 41 | +model.export("gguf", "tinyllama-f16.gguf", quantization="F16") |
62 | 42 |
|
63 | 43 | print("\n✅ All exports complete!") |
64 | 44 | print("\nUse these files with:") |
65 | | -print(" - llama.cpp: ./main -m tinyllama-q4.gguf") |
| 45 | +print(" - llama.cpp: ./llama-cli -m tinyllama-q4.gguf -p 'Hello!'") |
66 | 46 | print(" - Ollama: ollama create mymodel -f Modelfile") |
67 | 47 | print(" - LM Studio: Import the .gguf file") |
0 commit comments