-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathawquant.py
More file actions
32 lines (24 loc) · 951 Bytes
/
Copy pathawquant.py
File metadata and controls
32 lines (24 loc) · 951 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
"""
Quantize an LLM with Activation-aware Weight Quantization (AWQ) and save the quantized model.
"""
import os
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer
# Location of the full-precision model to quantize. An unset OR empty
# OUTPUT_MODEL_NAME falls back to the default, so a blank variable can never
# yield an empty model path (and a bare "-awq" output directory).
model_path = os.environ.get('OUTPUT_MODEL_NAME') or "assistant-mistral-7b-dolphin-2.2.1"
# The tokenizer is loaded from the same location as the model weights.
tokenizer_path = model_path
print(f"Taking full precision weights and biases from {model_path}.")

# The quantized model is written next to the source name, suffixed with "-awq".
quant_path = f'{model_path}-awq'
# Settings handed to AutoAWQ's quantize(): "w_bit" is the weight bit width,
# "q_group_size" the quantization group size, "zero_point" toggles zero-point
# quantization (see the AutoAWQ documentation for the exact semantics).
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4}
print("Exporting:")
print(f"{quant_config=}")
print(f"{quant_path=}")

# Load model
print("Loading model")
model = AutoAWQForCausalLM.from_pretrained(model_path)
# NOTE(security): trust_remote_code=True allows custom Python code shipped with
# the tokenizer files to be executed; only point this at sources you trust.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

# Quantize
model.quantize(tokenizer, quant_config=quant_config)

# Save quantized model; the tokenizer is saved to the same directory so the
# output can be loaded on its own.
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)
print("Done.")