Skip to content

Commit 31f246b

Browse files
committed
Disable KV cache quantization
1 parent da8cc24 commit 31f246b

1 file changed

Lines changed: 14 additions & 1 deletion

File tree

  • examples/llm_compression/openvino/tiny_llama_synthetic_data

examples/llm_compression/openvino/tiny_llama_synthetic_data/main.py

Lines changed: 14 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,9 +41,22 @@ def transform_func(text, tokenizer):
41 41

42 42
def main():
43 43
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
44+
ov_config = {
45+
"PERFORMANCE_HINT": "LATENCY",
46+
"NUM_STREAMS": "1",
47+
"CACHE_DIR": "",
48+
"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0",
49+
"KV_CACHE_PRECISION": "f16",
50+
}
44 51

45 52
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
46-
hf_model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, load_in_8bit=False, compile=False)
53+
hf_model = OVModelForCausalLM.from_pretrained(
54+
MODEL_ID,
55+
export=True,
56+
load_in_8bit=False,
57+
compile=False,
58+
ov_config=ov_config,
59+
)
47 60

48 61
dataset_size = 100
49 62

0 commit comments

Comments
 (0)