# Patch summary (text before the first "diff --git" header is not part of any hunk):
#   1. Adds configs/qwen2_5/fp8_dynamic/qwen2_5-14b_instruct_fp8_dynamic.yaml, a PTQ
#      config with quantization name "fp8_dynamic" for model_path Qwen/Qwen2.5-14B-Instruct.
#   2. Changes model_path in the existing fp8_static config for the same model from
#      Qwen/Qwen2.5-Coder-14B-Instruct to Qwen/Qwen2.5-14B-Instruct.
# NOTE(review): this patch was recovered from a whitespace-collapsed copy. Line breaks
# follow the hunk headers (+1,26 and -5,7 +5,7); the YAML indentation (2 spaces, with
# quant_method and ignore_layers nested under quantization) is an assumption -- confirm
# against the repository before applying.
diff --git a/configs/qwen2_5/fp8_dynamic/qwen2_5-14b_instruct_fp8_dynamic.yaml b/configs/qwen2_5/fp8_dynamic/qwen2_5-14b_instruct_fp8_dynamic.yaml
new file mode 100644
index 00000000..376b803b
--- /dev/null
+++ b/configs/qwen2_5/fp8_dynamic/qwen2_5-14b_instruct_fp8_dynamic.yaml
@@ -0,0 +1,26 @@
+# Global configuration of pipeline
+global:
+  save_path: ./output
+
+# Simplified Configuration for LLM compression
+model:
+  name: Qwen
+  model_path: Qwen/Qwen2.5-14B-Instruct
+  trust_remote_code: true
+  low_cpu_mem_usage: true
+  use_cache: false
+  torch_dtype: auto
+  device_map: auto
+
+# Compression configuration
+compression:
+  name: PTQ
+  quantization:
+    name: fp8_dynamic  # Supported: fp8_static, fp8_dynamic, int4_awq, int4_gptq
+    bits: 8  # Quantization bits (4/8)
+    quant_method:
+      weight: "per-tensor"
+      activation: "per-tensor"
+    ignore_layers:  # Skip quantization for these layers
+      - "lm_head"
+      - "model.embed_tokens"
diff --git a/configs/qwen2_5/fp8_static/qwen2_5-14b_instruct_fp8_static.yaml b/configs/qwen2_5/fp8_static/qwen2_5-14b_instruct_fp8_static.yaml
index 48bf2900..7353bc93 100644
--- a/configs/qwen2_5/fp8_static/qwen2_5-14b_instruct_fp8_static.yaml
+++ b/configs/qwen2_5/fp8_static/qwen2_5-14b_instruct_fp8_static.yaml
@@ -5,7 +5,7 @@ global:
 # Simplified Configuration for LLM compression
 model:
   name: Qwen
-  model_path: Qwen/Qwen2.5-Coder-14B-Instruct
+  model_path: Qwen/Qwen2.5-14B-Instruct
   trust_remote_code: true
   low_cpu_mem_usage: true
   use_cache: false